From 6d6a4360876f1e758e215570ccb04518db7cec3a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: mm: use performance variant for_each_cpu_mask_nr Change references from for_each_cpu_mask to for_each_cpu_mask_nr where appropriate Reviewed-by: Paul Jackson Reviewed-by: Christoph Lameter Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- mm/allocpercpu.c | 4 ++-- mm/vmstat.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index f4026bae6eed..b5411c2c47d4 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -35,7 +35,7 @@ EXPORT_SYMBOL_GPL(percpu_depopulate); void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) { int cpu; - for_each_cpu_mask(cpu, *mask) + for_each_cpu_mask_nr(cpu, *mask) percpu_depopulate(__pdata, cpu); } EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); @@ -86,7 +86,7 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, int cpu; cpus_clear(populated); - for_each_cpu_mask(cpu, *mask) + for_each_cpu_mask_nr(cpu, *mask) if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { __percpu_depopulate_mask(__pdata, &populated); return -ENOMEM; diff --git a/mm/vmstat.c b/mm/vmstat.c index db9eabb2c5b3..c3d4a781802f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -26,7 +26,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); - for_each_cpu_mask(cpu, *cpumask) { + for_each_cpu_mask_nr(cpu, *cpumask) { struct vm_event_state *this = &per_cpu(vm_event_states, cpu); for (i = 0; i < NR_VM_EVENT_ITEMS; i++) -- cgit v1.2.2 From 6b74ab97bc12ce74acec900f1d89a4aee2e4d70d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:26:49 -0700 Subject: mm: add a basic debugging framework for memory initialisation Boot initialisation is very complex, with significant numbers of architecture-specific routines, hooks and code ordering. While significant amounts of the initialisation is architecture-independent, it trusts the data received from the architecture layer. This is a mistake, and has resulted in a number of difficult-to-diagnose bugs. This patchset adds some validation and tracing to memory initialisation. It also introduces a few basic defensive measures. The validation code can be explicitly disabled for embedded systems. This patch: Add additional debugging and verification code for memory initialisation. Once enabled, the verification checks are always run and when required additional debugging information may be outputted via a mminit_loglevel= command-line parameter. The verification code is placed in a new file mm/mm_init.c. Ideally other mm initialisation code will be moved here over time. 
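As a usage illustration (not part of the patch), a call site in an init path would look like the sketch below. The "pageranges" prefix and the nid/start_pfn/end_pfn variables are invented for the example; mminit_dprintk() and the MMINIT_* levels are the ones this patch adds to mm/internal.h. Because the macro only prints when the message level is below mminit_loglevel, an MMINIT_TRACE message like this one appears only when booting with mminit_loglevel=3 or higher.

/* Illustrative only -- depends on mm/internal.h, i.e. code living under mm/ */
static void __init report_node_range(int nid, unsigned long start_pfn,
				     unsigned long end_pfn)
{
	mminit_dprintk(MMINIT_TRACE, "pageranges",
		"node %d pfns %lu -> %lu\n", nid, start_pfn, end_pfn);
}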
Signed-off-by: Mel Gorman Cc: Christoph Lameter Cc: Andy Whitcroft Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 1 + mm/internal.h | 27 +++++++++++++++++++++++++++ mm/mm_init.c | 18 ++++++++++++++++++ mm/page_alloc.c | 22 +++++++++++++--------- 4 files changed, 59 insertions(+), 9 deletions(-) create mode 100644 mm/mm_init.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 18c143b3c46c..4bbc8f094ff0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_SLAB) += slab.o +obj-$(CONFIG_DEBUG_MEMORY_INIT) += mm_init.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o diff --git a/mm/internal.h b/mm/internal.h index 0034e947e4bc..a7ee05253294 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -59,4 +59,31 @@ static inline unsigned long page_order(struct page *page) #define __paginginit __init #endif +/* Memory initialisation debug and verification */ +enum mminit_level { + MMINIT_WARNING, + MMINIT_VERIFY, + MMINIT_TRACE +}; + +#ifdef CONFIG_DEBUG_MEMORY_INIT + +extern int mminit_loglevel; + +#define mminit_dprintk(level, prefix, fmt, arg...) \ +do { \ + if (level < mminit_loglevel) { \ + printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ + printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ + } \ +} while (0) + +#else + +static inline void mminit_dprintk(enum mminit_level level, + const char *prefix, const char *fmt, ...) +{ +} + +#endif /* CONFIG_DEBUG_MEMORY_INIT */ #endif diff --git a/mm/mm_init.c b/mm/mm_init.c new file mode 100644 index 000000000000..c01d8dfec817 --- /dev/null +++ b/mm/mm_init.c @@ -0,0 +1,18 @@ +/* + * mm_init.c - Memory initialisation verification and debugging + * + * Copyright 2008 IBM Corporation, 2008 + * Author Mel Gorman + * + */ +#include +#include + +int __meminitdata mminit_loglevel; + +static __init int set_mminit_loglevel(char *str) +{ + get_option(&str, &mminit_loglevel); + return 0; +} +early_param("mminit_loglevel", set_mminit_loglevel); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 79ac4afc908c..0908352ba727 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2975,7 +2975,8 @@ void __init sparse_memory_present_with_active_regions(int nid) void __init push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn) { - printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", + mminit_dprintk(MMINIT_TRACE, "zoneboundary", + "Entering push_node_boundaries(%u, %lu, %lu)\n", nid, start_pfn, end_pfn); /* Initialise the boundary for this node if necessary */ @@ -2993,7 +2994,8 @@ void __init push_node_boundaries(unsigned int nid, static void __meminit account_node_boundary(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) { - printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", + mminit_dprintk(MMINIT_TRACE, "zoneboundary", + "Entering account_node_boundary(%u, %lu, %lu)\n", nid, *start_pfn, *end_pfn); /* Return if boundary information has not been provided */ @@ -3368,8 +3370,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; if (realsize >= memmap_pages) { realsize -= memmap_pages; - printk(KERN_DEBUG - " %s zone: %lu pages used for memmap\n", + mminit_dprintk(MMINIT_TRACE, "memmap_init", + "%s zone: %lu pages used for memmap\n", 
zone_names[j], memmap_pages); } else printk(KERN_WARNING @@ -3379,7 +3381,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, /* Account for reserved pages */ if (j == 0 && realsize > dma_reserve) { realsize -= dma_reserve; - printk(KERN_DEBUG " %s zone: %lu pages reserved\n", + mminit_dprintk(MMINIT_TRACE, "memmap_init", + "%s zone: %lu pages reserved\n", zone_names[0], dma_reserve); } @@ -3520,10 +3523,11 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, { int i; - printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) " - "%d entries of %d used\n", - nid, start_pfn, end_pfn, - nr_nodemap_entries, MAX_ACTIVE_REGIONS); + mminit_dprintk(MMINIT_TRACE, "memory_register", + "Entering add_active_range(%d, %#lx, %#lx) " + "%d entries of %d used\n", + nid, start_pfn, end_pfn, + nr_nodemap_entries, MAX_ACTIVE_REGIONS); /* Merge with existing active regions if possible */ for (i = 0; i < nr_nodemap_entries; i++) { -- cgit v1.2.2 From 708614e6180f398cd307ea0048d48ba6fa274610 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:26:51 -0700 Subject: mm: verify the page links and memory model Print out information on how the page flags are being used if mminit_loglevel is MMINIT_VERIFY or higher and unconditionally performs sanity checks on the flags regardless of loglevel. When the page flags are updated with section, node and zone information, a check are made to ensure the values can be retrieved correctly. Finally we confirm that pfn_to_page and page_to_pfn are the correct inverse functions. [akpm@linux-foundation.org: fix printk warnings] Signed-off-by: Mel Gorman Cc: Christoph Lameter Cc: Andy Whitcroft Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 12 ++++++++++ mm/mm_init.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 8 +++++++ 3 files changed, 91 insertions(+) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index a7ee05253294..7a4a2885dc8e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -78,6 +78,10 @@ do { \ } \ } while (0) +extern void mminit_verify_pageflags_layout(void); +extern void mminit_verify_page_links(struct page *page, + enum zone_type zone, unsigned long nid, unsigned long pfn); + #else static inline void mminit_dprintk(enum mminit_level level, @@ -85,5 +89,13 @@ static inline void mminit_dprintk(enum mminit_level level, { } +static inline void mminit_verify_pageflags_layout(void) +{ +} + +static inline void mminit_verify_page_links(struct page *page, + enum zone_type zone, unsigned long nid, unsigned long pfn) +{ +} #endif /* CONFIG_DEBUG_MEMORY_INIT */ #endif diff --git a/mm/mm_init.c b/mm/mm_init.c index c01d8dfec817..e16990d629e6 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -7,9 +7,80 @@ */ #include #include +#include "internal.h" int __meminitdata mminit_loglevel; +void __init mminit_verify_pageflags_layout(void) +{ + int shift, width; + unsigned long or_mask, add_mask; + + shift = 8 * sizeof(unsigned long); + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", + "Section %d Node %d Zone %d Flags %d\n", + SECTIONS_WIDTH, + NODES_WIDTH, + ZONES_WIDTH, + NR_PAGEFLAGS); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", + "Section %d Node %d Zone %d\n", +#ifdef SECTIONS_SHIFT + SECTIONS_SHIFT, +#else + 0, +#endif + NODES_SHIFT, + ZONES_SHIFT); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", + "Section %lu Node %lu Zone 
%lu\n", + (unsigned long)SECTIONS_PGSHIFT, + (unsigned long)NODES_PGSHIFT, + (unsigned long)ZONES_PGSHIFT); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", + "Zone ID: %lu -> %lu\n", + (unsigned long)ZONEID_PGOFF, + (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", + "location: %d -> %d unused %d -> %d flags %d -> %d\n", + shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); +#ifdef NODE_NOT_IN_PAGE_FLAGS + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", + "Node not in page flags"); +#endif + + if (SECTIONS_WIDTH) { + shift -= SECTIONS_WIDTH; + BUG_ON(shift != SECTIONS_PGSHIFT); + } + if (NODES_WIDTH) { + shift -= NODES_WIDTH; + BUG_ON(shift != NODES_PGSHIFT); + } + if (ZONES_WIDTH) { + shift -= ZONES_WIDTH; + BUG_ON(shift != ZONES_PGSHIFT); + } + + /* Check for bitmask overlaps */ + or_mask = (ZONES_MASK << ZONES_PGSHIFT) | + (NODES_MASK << NODES_PGSHIFT) | + (SECTIONS_MASK << SECTIONS_PGSHIFT); + add_mask = (ZONES_MASK << ZONES_PGSHIFT) + + (NODES_MASK << NODES_PGSHIFT) + + (SECTIONS_MASK << SECTIONS_PGSHIFT); + BUG_ON(or_mask != add_mask); +} + +void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone, + unsigned long nid, unsigned long pfn) +{ + BUG_ON(page_to_nid(page) != nid); + BUG_ON(page_zonenum(page) != zone); + BUG_ON(page_to_pfn(page) != pfn); +} + static __init int set_mminit_loglevel(char *str) { get_option(&str, &mminit_loglevel); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0908352ba727..acab6ad326df 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2534,6 +2534,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, } page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); + mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); @@ -2836,6 +2837,12 @@ __meminit int init_currently_empty_zone(struct zone *zone, zone->zone_start_pfn = zone_start_pfn; + mminit_dprintk(MMINIT_TRACE, "memmap_init", + "Initialising map node %d zone %lu pfns %lu -> %lu\n", + pgdat->node_id, + (unsigned long)zone_idx(zone), + zone_start_pfn, (zone_start_pfn + size)); + zone_init_free_lists(zone); return 0; @@ -3961,6 +3968,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) early_node_map[i].end_pfn); /* Initialise every node */ + mminit_verify_pageflags_layout(); setup_nr_node_ids(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); -- cgit v1.2.2 From 2dbb51c49f4fecb8330e43247a0edfbc4b2b8974 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:26:52 -0700 Subject: mm: make defensive checks around PFN values registered for memory usage There are a number of different views to how much memory is currently active. There is the arch-independent zone-sizing view, the bootmem allocator and memory models view. Architectures register this information at different times and is not necessarily in sync particularly with respect to some SPARSEMEM limitations. This patch introduces mminit_validate_memmodel_limits() which is able to validate and correct PFN ranges with respect to the memory model. It is only SPARSEMEM that currently validates itself. 
Signed-off-by: Mel Gorman Cc: Christoph Lameter Cc: Andy Whitcroft Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 1 + mm/internal.h | 12 ++++++++++++ mm/page_alloc.c | 2 ++ mm/sparse.c | 37 +++++++++++++++++++++++++++++-------- 4 files changed, 44 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 8d9f60e06f62..9f4bbc5da73f 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -91,6 +91,7 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat, bootmem_data_t *bdata = pgdat->bdata; unsigned long mapsize; + mminit_validate_memmodel_limits(&start, &end); bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); bdata->node_boot_start = PFN_PHYS(start); bdata->node_low_pfn = end; diff --git a/mm/internal.h b/mm/internal.h index 7a4a2885dc8e..5d17f3efac41 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -98,4 +98,16 @@ static inline void mminit_verify_page_links(struct page *page, { } #endif /* CONFIG_DEBUG_MEMORY_INIT */ + +/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ +#if defined(CONFIG_SPARSEMEM) +extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, + unsigned long *end_pfn); +#else +static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, + unsigned long *end_pfn) +{ +} +#endif /* CONFIG_SPARSEMEM */ + #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index acab6ad326df..0adb66e711e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3536,6 +3536,8 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, nid, start_pfn, end_pfn, nr_nodemap_entries, MAX_ACTIVE_REGIONS); + mminit_validate_memmodel_limits(&start_pfn, &end_pfn); + /* Merge with existing active regions if possible */ for (i = 0; i < nr_nodemap_entries; i++) { if (early_node_map[i].nid != nid) diff --git a/mm/sparse.c b/mm/sparse.c index 36511c7b5e2c..7a3650923d9a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -12,6 +12,7 @@ #include #include #include +#include "internal.h" /* * Permanent SPARSEMEM data: @@ -147,22 +148,41 @@ static inline int sparse_early_nid(struct mem_section *section) return (section->section_mem_map >> SECTION_NID_SHIFT); } -/* Record a memory area against a node. */ -void __init memory_present(int nid, unsigned long start, unsigned long end) +/* Validate the physical addressing limitations of the model */ +void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, + unsigned long *end_pfn) { - unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); - unsigned long pfn; + unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); /* * Sanity checks - do not allow an architecture to pass * in larger pfns than the maximum scope of sparsemem: */ - if (start >= max_arch_pfn) - return; - if (end >= max_arch_pfn) - end = max_arch_pfn; + if (*start_pfn > max_sparsemem_pfn) { + mminit_dprintk(MMINIT_WARNING, "pfnvalidation", + "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n", + *start_pfn, *end_pfn, max_sparsemem_pfn); + WARN_ON_ONCE(1); + *start_pfn = max_sparsemem_pfn; + *end_pfn = max_sparsemem_pfn; + } + + if (*end_pfn > max_sparsemem_pfn) { + mminit_dprintk(MMINIT_WARNING, "pfnvalidation", + "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n", + *start_pfn, *end_pfn, max_sparsemem_pfn); + WARN_ON_ONCE(1); + *end_pfn = max_sparsemem_pfn; + } +} + +/* Record a memory area against a node. 
*/ +void __init memory_present(int nid, unsigned long start, unsigned long end) +{ + unsigned long pfn; start &= PAGE_SECTION_MASK; + mminit_validate_memmodel_limits(&start, &end); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { unsigned long section = pfn_to_section_nr(pfn); struct mem_section *ms; @@ -187,6 +207,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, unsigned long pfn; unsigned long nr_pages = 0; + mminit_validate_memmodel_limits(&start_pfn, &end_pfn); for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { if (nid != early_pfn_to_nid(pfn)) continue; -- cgit v1.2.2 From 68ad8df42e12037c3894c9706ab428bf5cd6426b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:26:52 -0700 Subject: mm: print out the zonelists on request for manual verification This patch prints out the zonelists during boot for manual verification by the user if the mminit_loglevel is MMINIT_VERIFY or higher. Signed-off-by: Mel Gorman Cc: Christoph Lameter Cc: Andy Whitcroft Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 +++++ mm/mm_init.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 1 + 3 files changed, 51 insertions(+) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 5d17f3efac41..50807e12490e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -81,6 +81,7 @@ do { \ extern void mminit_verify_pageflags_layout(void); extern void mminit_verify_page_links(struct page *page, enum zone_type zone, unsigned long nid, unsigned long pfn); +extern void mminit_verify_zonelist(void); #else @@ -97,6 +98,10 @@ static inline void mminit_verify_page_links(struct page *page, enum zone_type zone, unsigned long nid, unsigned long pfn) { } + +static inline void mminit_verify_zonelist(void) +{ +} #endif /* CONFIG_DEBUG_MEMORY_INIT */ /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ diff --git a/mm/mm_init.c b/mm/mm_init.c index e16990d629e6..ce445ca097e7 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -11,6 +11,51 @@ int __meminitdata mminit_loglevel; +/* The zonelists are simply reported, validation is manual. */ +void mminit_verify_zonelist(void) +{ + int nid; + + if (mminit_loglevel < MMINIT_VERIFY) + return; + + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *zone; + struct zoneref *z; + struct zonelist *zonelist; + int i, listid, zoneid; + + BUG_ON(MAX_ZONELISTS > 2); + for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) { + + /* Identify the zone and nodelist */ + zoneid = i % MAX_NR_ZONES; + listid = i / MAX_NR_ZONES; + zonelist = &pgdat->node_zonelists[listid]; + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + /* Print information about the zonelist */ + printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ", + listid > 0 ? 
"thisnode" : "general", nid, + zone->name); + + /* Iterate the zonelist */ + for_each_zone_zonelist(zone, z, zonelist, zoneid) { +#ifdef CONFIG_NUMA + printk(KERN_CONT "%d:%s ", + zone->node, zone->name); +#else + printk(KERN_CONT "0:%s ", zone->name); +#endif /* CONFIG_NUMA */ + } + printk(KERN_CONT "\n"); + } + } +} + void __init mminit_verify_pageflags_layout(void) { int shift, width; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0adb66e711e6..9ece07ce65b0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2352,6 +2352,7 @@ void build_all_zonelists(void) if (system_state == SYSTEM_BOOTING) { __build_all_zonelists(NULL); + mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } else { /* we have to stop all cpus to guarantee there is no user -- cgit v1.2.2 From b61bfa3c462671c48a51fb5c31af337c5a996a04 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:26:55 -0700 Subject: mm: move bootmem descriptors definition to a single place There are a lot of places that define either a single bootmem descriptor or an array of them. Use only one central array with MAX_NUMNODES items instead. Signed-off-by: Johannes Weiner Acked-by: Ralf Baechle Cc: Ingo Molnar Cc: Richard Henderson Cc: Russell King Cc: Tony Luck Cc: Hirokazu Takata Cc: Geert Uytterhoeven Cc: Kyle McMartin Cc: Paul Mackerras Cc: Paul Mundt Cc: David S. Miller Cc: Yinghai Lu Cc: Christoph Lameter Cc: Mel Gorman Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 2 ++ mm/page_alloc.c | 4 +--- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 9f4bbc5da73f..35b3cb667036 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -36,6 +36,8 @@ static LIST_HEAD(bdata_list); unsigned long saved_max_pfn; #endif +bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; + /* return the number of _pages_ that will be allocated for the boot bitmap */ unsigned long __init bootmem_bootmap_pages(unsigned long pages) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9ece07ce65b0..e089b92cdfff 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4040,9 +4040,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) } #ifndef CONFIG_NEED_MULTIPLE_NODES -static bootmem_data_t contig_bootmem_data; -struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; - +struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] }; EXPORT_SYMBOL(contig_page_data); #endif -- cgit v1.2.2 From 6b312c0e6e2f44b020e12953d1dd37eed60e3609 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:26:58 -0700 Subject: mm: fix free_all_bootmem_core alignment check The check for node_boot_start is bogus because we start freeing at the corresponding pfn. So check if the pfn is properly aligned instead in a more readable way and adjust the documentation. Also remove an unneeded accounting variable. 
Signed-off-by: Johannes Weiner Cc: Ingo Molnar Cc: Yinghai Lu Cc: Christoph Lameter Cc: Mel Gorman Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 35b3cb667036..319a79bce7cd 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -377,7 +377,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) struct page *page; unsigned long pfn; bootmem_data_t *bdata = pgdat->bdata; - unsigned long i, count, total = 0; + unsigned long i, count; unsigned long idx; unsigned long *map; int gofast = 0; @@ -389,10 +389,13 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) pfn = PFN_DOWN(bdata->node_boot_start); idx = bdata->node_low_pfn - pfn; map = bdata->node_bootmem_map; - /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ - if (bdata->node_boot_start == 0 || - ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG)) + /* + * Check if we are aligned to BITS_PER_LONG pages. If so, we might + * be able to free page orders of that size at once. + */ + if (!(pfn & (BITS_PER_LONG-1))) gofast = 1; + for (i = 0; i < idx; ) { unsigned long v = ~map[i / BITS_PER_LONG]; @@ -420,23 +423,19 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) } pfn += BITS_PER_LONG; } - total += count; /* * Now free the allocator bitmap itself, it's not * needed anymore: */ page = virt_to_page(bdata->node_bootmem_map); - count = 0; idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT; - for (i = 0; i < idx; i++, page++) { + for (i = 0; i < idx; i++, page++) __free_pages_bootmem(page, 0); - count++; - } - total += count; + count += i; bdata->node_bootmem_map = NULL; - return total; + return count; } unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, -- cgit v1.2.2 From 8ae04463077324ed9f6b04ab3a5b17ae1ee4dd35 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:26:59 -0700 Subject: mm: normalize internal argument passing of bootmem data All _core functions only need the bootmem data, not the whole node descriptor. Adjust the two functions that take the node descriptor unneededly. Signed-off-by: Johannes Weiner Cc: Ingo Molnar Cc: Yinghai Lu Cc: Christoph Lameter Cc: Mel Gorman Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 319a79bce7cd..251c66c5d96a 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -87,10 +87,9 @@ static unsigned long __init get_mapsize(bootmem_data_t *bdata) /* * Called once to set up the allocator itself. 
*/ -static unsigned long __init init_bootmem_core(pg_data_t *pgdat, +static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, unsigned long mapstart, unsigned long start, unsigned long end) { - bootmem_data_t *bdata = pgdat->bdata; unsigned long mapsize; mminit_validate_memmodel_limits(&start, &end); @@ -372,11 +371,10 @@ found: return ret; } -static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) +static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { struct page *page; unsigned long pfn; - bootmem_data_t *bdata = pgdat->bdata; unsigned long i, count; unsigned long idx; unsigned long *map; @@ -441,7 +439,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) { - return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); + return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn); } int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, @@ -466,14 +464,14 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); - return free_all_bootmem_core(pgdat); + return free_all_bootmem_core(pgdat->bdata); } unsigned long __init init_bootmem(unsigned long start, unsigned long pages) { max_low_pfn = pages; min_low_pfn = start; - return init_bootmem_core(NODE_DATA(0), start, 0, pages); + return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE @@ -504,7 +502,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size) unsigned long __init free_all_bootmem(void) { - return free_all_bootmem_core(NODE_DATA(0)); + return free_all_bootmem_core(NODE_DATA(0)->bdata); } void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, -- cgit v1.2.2 From ffc6421f0720f433b5b35b89ff56e998eabff93b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:26:59 -0700 Subject: mm: unexport __alloc_bootmem_core() This function has no external callers, so unexport it. Also fix its naming inconsistency. Signed-off-by: Johannes Weiner Cc: Ingo Molnar Cc: Yinghai Lu Cc: Christoph Lameter Cc: Mel Gorman Cc: Andy Whitcroft Cc: Mel Gorman Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 251c66c5d96a..4bc6ae2fbaa3 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -234,9 +234,9 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, * * NOTE: This function is _not_ reentrant. 
*/ -void * __init -__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, - unsigned long align, unsigned long goal, unsigned long limit) +static void * __init +alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, + unsigned long align, unsigned long goal, unsigned long limit) { unsigned long areasize, preferred; unsigned long i, start = 0, incr, eidx, end_pfn; @@ -245,7 +245,7 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, void *node_bootmem_map; if (!size) { - printk("__alloc_bootmem_core(): zero-sized request\n"); + printk("alloc_bootmem_core(): zero-sized request\n"); BUG(); } BUG_ON(align & (align-1)); @@ -512,7 +512,7 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, void *ptr; list_for_each_entry(bdata, &bdata_list, list) { - ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); + ptr = alloc_bootmem_core(bdata, size, align, goal, 0); if (ptr) return ptr; } @@ -540,7 +540,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, { void *ptr; - ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); + ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; @@ -559,8 +559,8 @@ void * __init alloc_bootmem_section(unsigned long size, goal = PFN_PHYS(pfn); limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; pgdat = NODE_DATA(early_pfn_to_nid(pfn)); - ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal, - limit); + ptr = alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal, + limit); if (!ptr) return NULL; @@ -589,8 +589,8 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void *ptr; list_for_each_entry(bdata, &bdata_list, list) { - ptr = __alloc_bootmem_core(bdata, size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); + ptr = alloc_bootmem_core(bdata, size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); if (ptr) return ptr; } @@ -606,6 +606,6 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - return __alloc_bootmem_core(pgdat->bdata, size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); + return alloc_bootmem_core(pgdat->bdata, size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); } -- cgit v1.2.2 From e4048e5dc4aecec670f48ed007a28779f09cebd6 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 23 Jul 2008 21:27:01 -0700 Subject: page allocator: inline some __alloc_pages() wrappers Two zonelist patch series rewrote __page_alloc() largely. Now, it is just a wrapper function. Inlining them will save a function call. [akpm@linux-foundation.org: export __alloc_pages_internal] Cc: Lee Schermerhorn Cc: Mel Gorman Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e089b92cdfff..35b1347d81bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1429,7 +1429,7 @@ try_next_zone: /* * This is the 'heart' of the zoned buddy allocator. 
*/ -static struct page * +struct page * __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { @@ -1632,22 +1632,7 @@ nopage: got_pg: return page; } - -struct page * -__alloc_pages(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist) -{ - return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); -} - -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) -{ - return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); -} - -EXPORT_SYMBOL(__alloc_pages); +EXPORT_SYMBOL(__alloc_pages_internal); /* * Common helper functions. -- cgit v1.2.2 From 4f5ca265788973e3f5a1129a96ee4a9cbf587f2b Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 23 Jul 2008 21:27:02 -0700 Subject: mm/migrate.c should #include Every file should include the headers containing the externs for its global functions (in this case for sys_move_pages()). Signed-off-by: Adrian Bunk Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 55bd355d170d..e7d13a708da0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "internal.h" -- cgit v1.2.2 From c748e1340e0de3fa7fed86f8bdf499be9242afff Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 23 Jul 2008 21:27:03 -0700 Subject: mm/vmstat.c: proper externs This patch adds proper extern declarations for five variables in include/linux/vmstat.h Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index c3d4a781802f..b0d08e667ece 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #ifdef CONFIG_VM_EVENT_COUNTERS -- cgit v1.2.2 From 75353bed36cfbbfb55bbde0896bbf5a02d9ba355 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 23 Jul 2008 21:27:03 -0700 Subject: mm/hugetlb.c: fix duplicate variable It's confusing that set_max_huge_pages() contained two different variables named "ret", and although the code works correctly this should be fixed. The inner of the two variables can simply be removed. Spotted by sparse. Signed-off-by: Adrian Bunk Cc: "KOSAKI Motohiro" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab171274ef21..2c5c9ee4220d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -603,7 +603,6 @@ static unsigned long set_max_huge_pages(unsigned long count) } while (count > persistent_huge_pages) { - int ret; /* * If this allocation races such that we no longer need the * page, free_huge_page will handle it by freeing the page -- cgit v1.2.2 From a969e903a944f69309ee5cc9e7c7b08310d1151e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Jul 2008 21:27:04 -0700 Subject: kill generic_file_direct_IO() generic_file_direct_IO is a common helper around the invocation of ->direct_IO. But there's almost nothing shared between the read and write side, so we're better off without this helper. 
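The read side, for instance, reduces to the two steps the helper used to wrap -- flush dirty pagecache, then call ->direct_IO() -- with none of the write-side unmap/invalidate work. Extracted from the generic_file_aio_read() hunk below as a standalone sketch:

retval = filemap_write_and_wait(mapping);
if (!retval) {
	retval = mapping->a_ops->direct_IO(READ, iocb,
					   iov, pos, nr_segs);
}
if (retval > 0)
	*ppos = pos + retval;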
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 117 ++++++++++++++++++++++++++--------------------------------- 1 file changed, 51 insertions(+), 66 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 65d9d9e2b755..6343f3c841b7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,9 +42,6 @@ #include -static ssize_t -generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs); /* * Shared mappings implemented 30.11.1994. It's not fully working yet, @@ -1205,8 +1202,11 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, goto out; /* skip atime */ size = i_size_read(inode); if (pos < size) { - retval = generic_file_direct_IO(READ, iocb, - iov, pos, nr_segs); + retval = filemap_write_and_wait(mapping); + if (!retval) { + retval = mapping->a_ops->direct_IO(READ, iocb, + iov, pos, nr_segs); + } if (retval > 0) *ppos = pos + retval; } @@ -2004,11 +2004,55 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t written; + size_t write_len; + pgoff_t end; if (count != ocount) *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); - written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); + /* + * Unmap all mmappings of the file up-front. + * + * This will cause any pte dirty bits to be propagated into the + * pageframes for the subsequent filemap_write_and_wait(). + */ + write_len = iov_length(iov, *nr_segs); + end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; + if (mapping_mapped(mapping)) + unmap_mapping_range(mapping, pos, write_len, 0); + + written = filemap_write_and_wait(mapping); + if (written) + goto out; + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + if (mapping->nrpages) { + written = invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + if (written) + goto out; + } + + written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); + + /* + * Finally, try again to invalidate clean pages which might have been + * cached by non-direct readahead, or faulted in by get_user_pages() + * if the source of the write was an mmap'ed region of the file + * we're writing. Either one is a pretty crazy thing to do, + * so we don't support it 100%. If this invalidation + * fails, tough, the write still worked... + */ + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + } + if (written > 0) { loff_t end = pos + written; if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { @@ -2024,6 +2068,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, * i_mutex is held, which protects generic_osync_inode() from * livelocking. AIO O_DIRECT ops attempt to sync metadata here. */ +out: if ((written >= 0 || written == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); @@ -2511,66 +2556,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } EXPORT_SYMBOL(generic_file_aio_write); -/* - * Called under i_mutex for writes to S_ISREG files. 
Returns -EIO if something - * went wrong during pagecache shootdown. - */ -static ssize_t -generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - ssize_t retval; - size_t write_len; - pgoff_t end = 0; /* silence gcc */ - - /* - * If it's a write, unmap all mmappings of the file up-front. This - * will cause any pte dirty bits to be propagated into the pageframes - * for the subsequent filemap_write_and_wait(). - */ - if (rw == WRITE) { - write_len = iov_length(iov, nr_segs); - end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT; - if (mapping_mapped(mapping)) - unmap_mapping_range(mapping, offset, write_len, 0); - } - - retval = filemap_write_and_wait(mapping); - if (retval) - goto out; - - /* - * After a write we want buffered reads to be sure to go to disk to get - * the new data. We invalidate clean cached page from the region we're - * about to write. We do this *before* the write so that we can return - * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). - */ - if (rw == WRITE && mapping->nrpages) { - retval = invalidate_inode_pages2_range(mapping, - offset >> PAGE_CACHE_SHIFT, end); - if (retval) - goto out; - } - - retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); - - /* - * Finally, try again to invalidate clean pages which might have been - * cached by non-direct readahead, or faulted in by get_user_pages() - * if the source of the write was an mmap'ed region of the file - * we're writing. Either one is a pretty crazy thing to do, - * so we don't support it 100%. If this invalidation - * fails, tough, the write still worked... - */ - if (rw == WRITE && mapping->nrpages) { - invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end); - } -out: - return retval; -} - /** * try_to_release_page() - release old fs-specific metadata on a page * -- cgit v1.2.2 From 0d71d10a4252a3938e6b70189bc776171c02e076 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 23 Jul 2008 21:27:05 -0700 Subject: mm: remove nopfn There are no users of nopfn in the tree. Remove it. [hugh@veritas.com: fix build error] Signed-off-by: Nick Piggin Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 67 +++++++------------------------------------------------------ 1 file changed, 7 insertions(+), 60 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 2302d228fe04..46dbed4b7446 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1058,11 +1058,9 @@ static inline int use_zero_page(struct vm_area_struct *vma) if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) return 0; /* - * And if we have a fault or a nopfn routine, it's not an - * anonymous region. + * And if we have a fault routine, it's not an anonymous region. */ - return !vma->vm_ops || - (!vma->vm_ops->fault && !vma->vm_ops->nopfn); + return !vma->vm_ops || !vma->vm_ops->fault; } int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, @@ -1338,6 +1336,11 @@ out: * * This function should only be called from a vm_ops->fault handler, and * in that case the handler should return NULL. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. 
*/ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) @@ -2501,59 +2504,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } - -/* - * do_no_pfn() tries to create a new page mapping for a page without - * a struct_page backing it - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. - * - * It is expected that the ->nopfn handler always returns the same pfn - * for a given virtual mapping. - * - * Mark this `noinline' to prevent it from bloating the main pagefault code. - */ -static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access) -{ - spinlock_t *ptl; - pte_t entry; - unsigned long pfn; - - pte_unmap(page_table); - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); - BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - - pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); - - BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); - - if (unlikely(pfn == NOPFN_OOM)) - return VM_FAULT_OOM; - else if (unlikely(pfn == NOPFN_SIGBUS)) - return VM_FAULT_SIGBUS; - else if (unlikely(pfn == NOPFN_REFAULT)) - return 0; - - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - - /* Only go through if we didn't race with anybody else... */ - if (pte_none(*page_table)) { - entry = pfn_pte(pfn, vma->vm_page_prot); - if (write_access) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - set_pte_at(mm, address, page_table, entry); - } - pte_unmap_unlock(page_table, ptl); - return 0; -} - /* * Fault of a previously existing named mapping. Repopulate the pte * from the encoded file_pte if possible. This enables swappable @@ -2614,9 +2564,6 @@ static inline int handle_pte_fault(struct mm_struct *mm, if (likely(vma->vm_ops->fault)) return do_linear_fault(mm, vma, address, pte, pmd, write_access, entry); - if (unlikely(vma->vm_ops->nopfn)) - return do_no_pfn(mm, vma, address, pte, - pmd, write_access); } return do_anonymous_page(mm, vma, address, pte, pmd, write_access); -- cgit v1.2.2 From 28b2ee20c7cba812b6f2ccf6d722cf86d00a84dc Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 23 Jul 2008 21:27:05 -0700 Subject: access_process_vm device memory infrastructure In order to be able to debug things like the X server and programs using the PPC Cell SPUs, the debugger needs to be able to access device memory through ptrace and /proc/pid/mem. This patch: Add the generic_access_phys access function and put the hooks in place to allow access_process_vm to access device or PPC Cell SPU memory. 
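By way of illustration, a driver that maps MMIO with remap_pfn_range() can opt in to this debugger access roughly as sketched below. The mydrv_* names and MYDRV_MMIO_PFN are invented; generic_access_phys() is the helper added here (built only under CONFIG_HAVE_IOREMAP_PROT), and ->access is the new vm_operations hook that access_process_vm() falls back to when get_user_pages() fails on a VM_IO/VM_PFNMAP mapping.

#include <linux/fs.h>
#include <linux/mm.h>

#define MYDRV_MMIO_PFN	0xfd000UL	/* hypothetical device page frame */

static struct vm_operations_struct mydrv_vm_ops = {
	.access	= generic_access_phys,	/* let ptrace / /proc/<pid>/mem in */
};

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &mydrv_vm_ops;
	return remap_pfn_range(vma, vma->vm_start, MYDRV_MMIO_PFN,
			       vma->vm_end - vma->vm_start,
			       pgprot_noncached(vma->vm_page_prot));
}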
[riel@redhat.com: Add documentation for the vm_ops->access function] Signed-off-by: Rik van Riel Signed-off-by: Benjamin Herrensmidt Cc: Dave Airlie Cc: Hugh Dickins Cc: Paul Mackerras Cc: Arnd Bergmann Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 113 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 46dbed4b7446..87350321e66f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2751,6 +2751,86 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ +#ifdef CONFIG_HAVE_IOREMAP_PROT +static resource_size_t follow_phys(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned long *prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + resource_size_t phys_addr = 0; + struct mm_struct *mm = vma->vm_mm; + + VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP))); + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto no_page_table; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto no_page_table; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto no_page_table; + + /* We cannot handle huge page PFN maps. Luckily they don't exist. */ + if (pmd_huge(*pmd)) + goto no_page_table; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!ptep) + goto out; + + pte = *ptep; + if (!pte_present(pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + phys_addr = pte_pfn(pte); + phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ + + *prot = pgprot_val(pte_pgprot(pte)); + +unlock: + pte_unmap_unlock(ptep, ptl); +out: + return phys_addr; +no_page_table: + return 0; +} + +int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + resource_size_t phys_addr; + unsigned long prot = 0; + void *maddr; + int offset = addr & (PAGE_SIZE-1); + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + + phys_addr = follow_phys(vma, addr, write, &prot); + + if (!phys_addr) + return -EINVAL; + + maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); + if (write) + memcpy_toio(maddr + offset, buf, len); + else + memcpy_fromio(buf, maddr + offset, len); + iounmap(maddr); + + return len; +} +#endif + /* * Access another process' address space. * Source/target buffer must be kernel space, @@ -2760,7 +2840,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in { struct mm_struct *mm; struct vm_area_struct *vma; - struct page *page; void *old_buf = buf; mm = get_task_mm(tsk); @@ -2772,28 +2851,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in while (len) { int bytes, ret, offset; void *maddr; + struct page *page = NULL; ret = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); - if (ret <= 0) - break; - - bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; - - maddr = kmap(page); - if (write) { - copy_to_user_page(vma, page, addr, - maddr + offset, buf, bytes); - set_page_dirty_lock(page); + if (ret <= 0) { + /* + * Check if this is a VM_IO | VM_PFNMAP VMA, which + * we can access using slightly different code. 
+ */ +#ifdef CONFIG_HAVE_IOREMAP_PROT + vma = find_vma(mm, addr); + if (!vma) + break; + if (vma->vm_ops && vma->vm_ops->access) + ret = vma->vm_ops->access(vma, addr, buf, + len, write); + if (ret <= 0) +#endif + break; + bytes = ret; } else { - copy_from_user_page(vma, page, addr, - buf, maddr + offset, bytes); + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + page_cache_release(page); } - kunmap(page); - page_cache_release(page); len -= bytes; buf += bytes; addr += bytes; -- cgit v1.2.2 From 42b7772812d15b86543a23b82bd6070eef9a08b1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 23 Jul 2008 21:27:10 -0700 Subject: mm: remove double indirection on tlb parameter to free_pgd_range() & Co The double indirection here is not needed anywhere and hence (at least) confusing. Signed-off-by: Jan Beulich Cc: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Luck, Tony" Cc: Paul Mundt Cc: "David S. Miller" Acked-by: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 +++ mm/memory.c | 10 ++++++---- mm/mmap.c | 6 ++++-- 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 50807e12490e..858ad01864dc 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -13,6 +13,9 @@ #include +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, + unsigned long floor, unsigned long ceiling); + static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_count, v); diff --git a/mm/memory.c b/mm/memory.c index 87350321e66f..82f3f1c5cf17 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -61,6 +61,8 @@ #include #include +#include "internal.h" + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; @@ -211,7 +213,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, * * Must be called with pagetable lock held. */ -void free_pgd_range(struct mmu_gather **tlb, +void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { @@ -262,16 +264,16 @@ void free_pgd_range(struct mmu_gather **tlb, return; start = addr; - pgd = pgd_offset((*tlb)->mm, addr); + pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - free_pud_range(*tlb, pgd, addr, next, floor, ceiling); + free_pud_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling) { while (vma) { diff --git a/mm/mmap.c b/mm/mmap.c index 1d102b956fd8..75e0d0673d78 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -32,6 +32,8 @@ #include #include +#include "internal.h" + #ifndef arch_mmap_check #define arch_mmap_check(addr, len, flags) (0) #endif @@ -1763,7 +1765,7 @@ static void unmap_region(struct mm_struct *mm, update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(&tlb, vma, prev? 
prev->vm_end: FIRST_USER_ADDRESS, + free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); tlb_finish_mmu(tlb, start, end); } @@ -2063,7 +2065,7 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); + free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); /* -- cgit v1.2.2 From 3c82d0ce2c4f642b2f24ef98707a030543b06b90 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 23 Jul 2008 21:27:11 -0700 Subject: buddy: clarify comments describing buddy merge In __free_one_page(), the comment "Move the buddy up one level" appears attached to the break and by implication when the break is taken we are moving it up one level: if (!page_is_buddy(page, buddy, order)) break; /* Move the buddy up one level. */ In reality the inverse is true, we break out when we can no longer merge this page with its buddy. Looking back into pre-history (into the full git history) it appears that these two lines accidentally got joined as part of another change. Move the comment down where it belongs below the if and clarify its language. Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 35b1347d81bb..24aa3d1b9d96 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -432,8 +432,9 @@ static inline void __free_one_page(struct page *page, buddy = __page_find_buddy(page, page_idx, order); if (!page_is_buddy(page, buddy, order)) - break; /* Move the buddy up one level. */ + break; + /* Our buddy is free, merge with it and move up one order. */ list_del(&buddy->lru); zone->free_area[order].nr_free--; rmv_page_order(buddy); -- cgit v1.2.2 From 8a38082d21cbc5ec961da7dda195e98a9a064dcf Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 23 Jul 2008 21:27:18 -0700 Subject: slub: record page flag overlays explicitly SLUB reuses two page bits for internal purposes, it overlays PG_active and PG_error. This is hidden away in slub.c. Document these overlays explicitly in the main page-flags enum along with all the others. Signed-off-by: Andy Whitcroft Cc: Pekka Enberg Cc: Christoph Lameter Cc: Matt Mackall Cc: Nick Piggin Tested-by: KOSAKI Motohiro Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 65 +++++++++++++++++---------------------------------------------- 1 file changed, 17 insertions(+), 48 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 6d4a49c1ff2f..77c21cf53ff9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -102,44 +102,12 @@ * the fast path and disables lockless freelists. 
*/ -#define FROZEN (1 << PG_active) - #ifdef CONFIG_SLUB_DEBUG -#define SLABDEBUG (1 << PG_error) +#define SLABDEBUG 1 #else #define SLABDEBUG 0 #endif -static inline int SlabFrozen(struct page *page) -{ - return page->flags & FROZEN; -} - -static inline void SetSlabFrozen(struct page *page) -{ - page->flags |= FROZEN; -} - -static inline void ClearSlabFrozen(struct page *page) -{ - page->flags &= ~FROZEN; -} - -static inline int SlabDebug(struct page *page) -{ - return page->flags & SLABDEBUG; -} - -static inline void SetSlabDebug(struct page *page) -{ - page->flags |= SLABDEBUG; -} - -static inline void ClearSlabDebug(struct page *page) -{ - page->flags &= ~SLABDEBUG; -} - /* * Issues still to be resolved: * @@ -971,7 +939,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, } /* Special debug activities for freeing objects */ - if (!SlabFrozen(page) && !page->freelist) + if (!PageSlubFrozen(page) && !page->freelist) remove_full(s, page); if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); @@ -1157,7 +1125,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) page->flags |= 1 << PG_slab; if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | SLAB_TRACE)) - SetSlabDebug(page); + __SetPageSlubDebug(page); start = page_address(page); @@ -1184,14 +1152,14 @@ static void __free_slab(struct kmem_cache *s, struct page *page) int order = compound_order(page); int pages = 1 << order; - if (unlikely(SlabDebug(page))) { + if (unlikely(SLABDEBUG && PageSlubDebug(page))) { void *p; slab_pad_check(s, page); for_each_object(p, s, page_address(page), page->objects) check_object(s, page, p, 0); - ClearSlabDebug(page); + __ClearPageSlubDebug(page); } mod_zone_page_state(page_zone(page), @@ -1288,7 +1256,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, if (slab_trylock(page)) { list_del(&page->lru); n->nr_partial--; - SetSlabFrozen(page); + __SetPageSlubFrozen(page); return 1; } return 0; @@ -1398,7 +1366,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) struct kmem_cache_node *n = get_node(s, page_to_nid(page)); struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); - ClearSlabFrozen(page); + __ClearPageSlubFrozen(page); if (page->inuse) { if (page->freelist) { @@ -1406,7 +1374,8 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) stat(c, tail ? 
DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); } else { stat(c, DEACTIVATE_FULL); - if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) + if (SLABDEBUG && PageSlubDebug(page) && + (s->flags & SLAB_STORE_USER)) add_full(n, page); } slab_unlock(page); @@ -1551,7 +1520,7 @@ load_freelist: object = c->page->freelist; if (unlikely(!object)) goto another_slab; - if (unlikely(SlabDebug(c->page))) + if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) goto debug; c->freelist = object[c->offset]; @@ -1588,7 +1557,7 @@ new_slab: if (c->page) flush_slab(s, c); slab_lock(new); - SetSlabFrozen(new); + __SetPageSlubFrozen(new); c->page = new; goto load_freelist; } @@ -1674,7 +1643,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(c, FREE_SLOWPATH); slab_lock(page); - if (unlikely(SlabDebug(page))) + if (unlikely(SLABDEBUG && PageSlubDebug(page))) goto debug; checks_ok: @@ -1682,7 +1651,7 @@ checks_ok: page->freelist = object; page->inuse--; - if (unlikely(SlabFrozen(page))) { + if (unlikely(PageSlubFrozen(page))) { stat(c, FREE_FROZEN); goto out_unlock; } @@ -3317,12 +3286,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page, s->name, page); if (s->flags & DEBUG_DEFAULT_FLAGS) { - if (!SlabDebug(page)) - printk(KERN_ERR "SLUB %s: SlabDebug not set " + if (!PageSlubDebug(page)) + printk(KERN_ERR "SLUB %s: SlubDebug not set " "on slab 0x%p\n", s->name, page); } else { - if (SlabDebug(page)) - printk(KERN_ERR "SLUB %s: SlabDebug set on " + if (PageSlubDebug(page)) + printk(KERN_ERR "SLUB %s: SlubDebug set on " "slab 0x%p\n", s->name, page); } } -- cgit v1.2.2 From 9023cb7e8564d95a1893f8cb6895a293be9a71fe Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 23 Jul 2008 21:27:19 -0700 Subject: slob: record page flag overlays explicitly SLOB reuses two page bits for internal purposes, it overlays PG_active and PG_private. This is hidden away in slob.c. Document these overlays explicitly in the main page-flags enum along with all the others. 
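The enum-side half of these overlays lives in include/linux/page-flags.h and so falls outside the 'mm'-filtered diffs shown here. As a rough sketch only (not the literal hunk), the aliases for the SLUB and SLOB bits named in these two commit messages take this shape:

enum pageflags {
	/* ... the real flag bits ... */
	PG_slob_page = PG_active,	/* SLOB: page backs slob allocations */
	PG_slob_free = PG_private,	/* SLOB: page is on a slob free list */
	PG_slub_frozen = PG_active,	/* SLUB: slab is frozen to a cpu */
	PG_slub_debug = PG_error,	/* SLUB: debugging active on this slab */
	/* ... */
};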
Signed-off-by: Andy Whitcroft Cc: Pekka Enberg Cc: Christoph Lameter Cc: Matt Mackall Cc: Nick Piggin Reviewed-by: KOSAKI Motohiro Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slob.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index a3ad6671adf1..de268eb7ac70 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -130,17 +130,17 @@ static LIST_HEAD(free_slob_large); */ static inline int slob_page(struct slob_page *sp) { - return test_bit(PG_active, &sp->flags); + return PageSlobPage((struct page *)sp); } static inline void set_slob_page(struct slob_page *sp) { - __set_bit(PG_active, &sp->flags); + __SetPageSlobPage((struct page *)sp); } static inline void clear_slob_page(struct slob_page *sp) { - __clear_bit(PG_active, &sp->flags); + __ClearPageSlobPage((struct page *)sp); } /* @@ -148,19 +148,19 @@ static inline void clear_slob_page(struct slob_page *sp) */ static inline int slob_page_free(struct slob_page *sp) { - return test_bit(PG_private, &sp->flags); + return PageSlobFree((struct page *)sp); } static void set_slob_page_free(struct slob_page *sp, struct list_head *list) { list_add(&sp->list, list); - __set_bit(PG_private, &sp->flags); + __SetPageSlobFree((struct page *)sp); } static inline void clear_slob_page_free(struct slob_page *sp) { list_del(&sp->list); - __clear_bit(PG_private, &sp->flags); + __ClearPageSlobFree((struct page *)sp); } #define SLOB_UNIT sizeof(slob_t) -- cgit v1.2.2 From 9109fb7b3520de187ebc3646c209d66a233f7169 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:27:20 -0700 Subject: mm: drop unneeded pgdat argument from free_area_init_node() free_area_init_node() gets passed in the node id as well as the node descriptor. This is redundant as the function can trivially get the node descriptor itself by means of NODE_DATA() and the node's id. I checked all the users and NODE_DATA() seems to be usable everywhere from where this function is called. 
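
The shape of this cleanup is easy to model outside the kernel: drop a parameter the callee can derive on its own, so callers can no longer pass a mismatched (nid, pgdat) pair. A hedged standalone sketch, with toy names standing in for NODE_DATA() and pg_data_t:

#include <stdio.h>

struct toy_pgdat {
	int node_id;
	unsigned long node_start_pfn;
};

static struct toy_pgdat toy_node_data[4];
#define TOY_NODE_DATA(nid)	(&toy_node_data[(nid)])

/* After the change: the descriptor is looked up from the node id. */
static void toy_free_area_init_node(int nid, unsigned long start_pfn)
{
	struct toy_pgdat *pgdat = TOY_NODE_DATA(nid);

	pgdat->node_id = nid;
	pgdat->node_start_pfn = start_pfn;
	printf("node %d initialised from pfn %lu\n",
	       pgdat->node_id, pgdat->node_start_pfn);
}

int main(void)
{
	for (int nid = 0; nid < 4; nid++)
		toy_free_area_init_node(nid, nid * 1024UL);
	return 0;
}
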
Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 833f854eabe5..6e26adc08f14 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -455,7 +455,7 @@ static pg_data_t *hotadd_new_pgdat(int nid, u64 start) /* we can use NODE_DATA(nid) from here */ /* init node's zones as empty zones, we don't have any present pages.*/ - free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); + free_area_init_node(nid, zones_size, start_pfn, zholes_size); return pgdat; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 24aa3d1b9d96..e43aae135b38 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3461,10 +3461,11 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) #endif /* CONFIG_FLAT_NODE_MEM_MAP */ } -void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long node_start_pfn, - unsigned long *zholes_size) +void __paginginit free_area_init_node(int nid, unsigned long *zones_size, + unsigned long node_start_pfn, unsigned long *zholes_size) { + pg_data_t *pgdat = NODE_DATA(nid); + pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; calculate_node_totalpages(pgdat, zones_size, zholes_size); @@ -3961,7 +3962,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) setup_nr_node_ids(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); - free_area_init_node(nid, pgdat, NULL, + free_area_init_node(nid, NULL, find_min_pfn_for_node(nid), NULL); /* Any memory on that node */ @@ -4032,7 +4033,7 @@ EXPORT_SYMBOL(contig_page_data); void __init free_area_init(unsigned long *zones_size) { - free_area_init_node(0, NODE_DATA(0), zones_size, + free_area_init_node(0, zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } -- cgit v1.2.2 From fc1b8a73dd71226902a11928dd5500326e101df9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:27:22 -0700 Subject: hugetlb: move hugetlb_acct_memory() This is a patchset to give reliable behaviour to a process that successfully calls mmap(MAP_PRIVATE) on a hugetlbfs file. Currently, it is possible for the process to be killed due to a small hugepage pool size even if it calls mlock(). MAP_SHARED mappings on hugetlbfs reserve huge pages at mmap() time. This guarantees all future faults against the mapping will succeed. This allows local allocations at first use improving NUMA locality whilst retaining reliability. MAP_PRIVATE mappings do not reserve pages. This can result in an application being SIGKILLed later if a huge page is not available at fault time. This makes huge pages usage very ill-advised in some cases as the unexpected application failure cannot be detected and handled as it is immediately fatal. Although an application may force instantiation of the pages using mlock(), this may lead to poor memory placement and the process may still be killed when performing COW. This patchset introduces a reliability guarantee for the process which creates a private mapping, i.e. the process that calls mmap() on a hugetlbfs file successfully. The first patch of the set is purely mechanical code move to make later diffs easier to read. The second patch will guarantee faults up until the process calls fork(). 
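
From userspace, the behaviour this series is after can be sketched as below. This is a hedged example only: the hugetlbfs mount point and the 2MB huge page size are assumptions, and whether the pre-series memset() survives depends entirely on the pool size at fault time.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HUGE_FILE "/mnt/huge/demo"		/* assumed hugetlbfs mount */
#define MAP_BYTES (4UL * 2 * 1024 * 1024)	/* 4 pages, assuming 2MB pages */

int main(void)
{
	int fd = open(HUGE_FILE, O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/*
	 * Without reservations a private mapping is handed out even when
	 * the pool cannot back it, and the faults below may be fatal.
	 * With this series the reservation is taken here instead, so an
	 * undersized pool makes mmap() fail with ENOMEM up front.
	 */
	char *map = mmap(NULL, MAP_BYTES, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}

	memset(map, 0x5a, MAP_BYTES);	/* instantiate the private copies */
	printf("faulted %lu bytes of MAP_PRIVATE huge pages\n", MAP_BYTES);

	munmap(map, MAP_BYTES);
	close(fd);
	unlink(HUGE_FILE);
	return 0;
}
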
After patch two, as long as the child keeps the mappings, the parent is no longer guaranteed to be reliable. Patch 3 guarantees that the parent will always successfully COW by unmapping the pages from the child in the event there are insufficient pages in the hugepage pool in allocate a new page, be it via a static or dynamic pool. Existing hugepage-aware applications are unlikely to be affected by this change. For much of hugetlbfs's history, pages were pre-faulted at mmap() time or mmap() failed which acts in a reserve-like manner. If the pool is sized correctly already so that parent and child can fault reliably, the application will not even notice the reserves. It's only when the pool is too small for the application to function perfectly reliably that the reserves come into play. Credit goes to Andy Whitcroft for cleaning up a number of mistakes during review before the patches were released. This patch: A later patch in this set needs to call hugetlb_acct_memory() before it is defined. This patch moves the function without modification. This makes later diffs easier to read. Signed-off-by: Mel Gorman Acked-by: Adam Litke Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 82 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 41 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2c5c9ee4220d..a4dbba8965f3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -716,6 +716,47 @@ unsigned long hugetlb_total_pages(void) return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); } +static int hugetlb_acct_memory(long delta) +{ + int ret = -ENOMEM; + + spin_lock(&hugetlb_lock); + /* + * When cpuset is configured, it breaks the strict hugetlb page + * reservation as the accounting is done on a global variable. Such + * reservation is completely rubbish in the presence of cpuset because + * the reservation is not checked against page availability for the + * current cpuset. Application can still potentially OOM'ed by kernel + * with lack of free htlb page in cpuset that the task is in. + * Attempt to enforce strict accounting with cpuset is almost + * impossible (or too ugly) because cpuset is too fluid that + * task or memory node can be dynamically moved between cpusets. + * + * The change of semantics for shared hugetlb mapping with cpuset is + * undesirable. However, in order to preserve some of the semantics, + * we fall back to check against current free page availability as + * a best attempt and hopefully to minimize the impact of changing + * semantics that cpuset has. + */ + if (delta > 0) { + if (gather_surplus_pages(delta) < 0) + goto out; + + if (delta > cpuset_mems_nr(free_huge_pages_node)) { + return_unused_surplus_pages(delta); + goto out; + } + } + + ret = 0; + if (delta < 0) + return_unused_surplus_pages((unsigned long) -delta); + +out: + spin_unlock(&hugetlb_lock); + return ret; +} + /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the @@ -1248,47 +1289,6 @@ static long region_truncate(struct list_head *head, long end) return chg; } -static int hugetlb_acct_memory(long delta) -{ - int ret = -ENOMEM; - - spin_lock(&hugetlb_lock); - /* - * When cpuset is configured, it breaks the strict hugetlb page - * reservation as the accounting is done on a global variable. 
Such - * reservation is completely rubbish in the presence of cpuset because - * the reservation is not checked against page availability for the - * current cpuset. Application can still potentially OOM'ed by kernel - * with lack of free htlb page in cpuset that the task is in. - * Attempt to enforce strict accounting with cpuset is almost - * impossible (or too ugly) because cpuset is too fluid that - * task or memory node can be dynamically moved between cpusets. - * - * The change of semantics for shared hugetlb mapping with cpuset is - * undesirable. However, in order to preserve some of the semantics, - * we fall back to check against current free page availability as - * a best attempt and hopefully to minimize the impact of changing - * semantics that cpuset has. - */ - if (delta > 0) { - if (gather_surplus_pages(delta) < 0) - goto out; - - if (delta > cpuset_mems_nr(free_huge_pages_node)) { - return_unused_surplus_pages(delta); - goto out; - } - } - - ret = 0; - if (delta < 0) - return_unused_surplus_pages((unsigned long) -delta); - -out: - spin_unlock(&hugetlb_lock); - return ret; -} - int hugetlb_reserve_pages(struct inode *inode, long from, long to) { long ret, chg; -- cgit v1.2.2 From a1e78772d72b2616ed20e54896e68e0e7044854e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:27:23 -0700 Subject: hugetlb: reserve huge pages for reliable MAP_PRIVATE hugetlbfs mappings until fork() This patch reserves huge pages at mmap() time for MAP_PRIVATE mappings in a similar manner to the reservations taken for MAP_SHARED mappings. The reserve count is accounted both globally and on a per-VMA basis for private mappings. This guarantees that a process that successfully calls mmap() will successfully fault all pages in the future unless fork() is called. The characteristics of private mappings of hugetlbfs files behaviour after this patch are; 1. The process calling mmap() is guaranteed to succeed all future faults until it forks(). 2. On fork(), the parent may die due to SIGKILL on writes to the private mapping if enough pages are not available for the COW. For reasonably reliable behaviour in the face of a small huge page pool, children of hugepage-aware processes should not reference the mappings; such as might occur when fork()ing to exec(). 3. On fork(), the child VMAs inherit no reserves. Reads on pages already faulted by the parent will succeed. Successful writes will depend on enough huge pages being free in the pool. 4. Quotas of the hugetlbfs mount are checked at reserve time for the mapper and at fault time otherwise. Before this patch, all reads or writes in the child potentially needs page allocations that can later lead to the death of the parent. This applies to reads and writes of uninstantiated pages as well as COW. After the patch it is only a write to an instantiated page that causes problems. Signed-off-by: Mel Gorman Acked-by: Adam Litke Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 158 ++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 119 insertions(+), 39 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a4dbba8965f3..0af500db3632 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -40,6 +40,69 @@ static int hugetlb_next_nid; */ static DEFINE_SPINLOCK(hugetlb_lock); +/* + * These helpers are used to track how many pages are reserved for + * faults in a MAP_PRIVATE mapping. 
Only the process that called mmap() + * is guaranteed to have their future faults succeed. + * + * With the exception of reset_vma_resv_huge_pages() which is called at fork(), + * the reserve counters are updated with the hugetlb_lock held. It is safe + * to reset the VMA at fork() time as it is not in use yet and there is no + * chance of the global counters getting corrupted as a result of the values. + */ +static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + if (!(vma->vm_flags & VM_SHARED)) + return (unsigned long)vma->vm_private_data; + return 0; +} + +static void set_vma_resv_huge_pages(struct vm_area_struct *vma, + unsigned long reserve) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON(vma->vm_flags & VM_SHARED); + + vma->vm_private_data = (void *)reserve; +} + +/* Decrement the reserved pages in the hugepage pool by one */ +static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHARED) { + /* Shared mappings always use reserves */ + resv_huge_pages--; + } else { + /* + * Only the process that called mmap() has reserves for + * private mappings. + */ + if (vma_resv_huge_pages(vma)) { + resv_huge_pages--; + reserve = (unsigned long)vma->vm_private_data - 1; + vma->vm_private_data = (void *)reserve; + } + } +} + +void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + if (!(vma->vm_flags & VM_SHARED)) + vma->vm_private_data = (void *)0; +} + +/* Returns true if the VMA has associated reserve pages */ +static int vma_has_private_reserves(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHARED) + return 0; + if (!vma_resv_huge_pages(vma)) + return 0; + return 1; +} + static void clear_huge_page(struct page *page, unsigned long addr) { int i; @@ -101,6 +164,15 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, struct zone *zone; struct zoneref *z; + /* + * A child process with MAP_PRIVATE mappings created by their parent + * have no page reserves. This check ensures that reservations are + * not "stolen". The child may still get SIGKILLed + */ + if (!vma_has_private_reserves(vma) && + free_huge_pages - resv_huge_pages == 0) + return NULL; + for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { nid = zone_to_nid(zone); @@ -111,8 +183,8 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, list_del(&page->lru); free_huge_pages--; free_huge_pages_node[nid]--; - if (vma && vma->vm_flags & VM_MAYSHARE) - resv_huge_pages--; + decrement_hugepage_resv_vma(vma); + break; } } @@ -461,55 +533,40 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) } } - -static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, - unsigned long addr) +static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr) { struct page *page; + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + unsigned int chg = 0; + + /* + * Processes that did not create the mapping will have no reserves and + * will not have accounted against quota. Check that the quota can be + * made before satisfying the allocation + */ + if (!vma_has_private_reserves(vma)) { + chg = 1; + if (hugetlb_get_quota(inode->i_mapping, chg)) + return ERR_PTR(-ENOSPC); + } spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(vma, addr); spin_unlock(&hugetlb_lock); - return page ? 
page : ERR_PTR(-VM_FAULT_OOM); -} -static struct page *alloc_huge_page_private(struct vm_area_struct *vma, - unsigned long addr) -{ - struct page *page = NULL; - - if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) - return ERR_PTR(-VM_FAULT_SIGBUS); - - spin_lock(&hugetlb_lock); - if (free_huge_pages > resv_huge_pages) - page = dequeue_huge_page_vma(vma, addr); - spin_unlock(&hugetlb_lock); if (!page) { page = alloc_buddy_huge_page(vma, addr); if (!page) { - hugetlb_put_quota(vma->vm_file->f_mapping, 1); + hugetlb_put_quota(inode->i_mapping, chg); return ERR_PTR(-VM_FAULT_OOM); } } - return page; -} -static struct page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr) -{ - struct page *page; - struct address_space *mapping = vma->vm_file->f_mapping; - - if (vma->vm_flags & VM_MAYSHARE) - page = alloc_huge_page_shared(vma, addr); - else - page = alloc_huge_page_private(vma, addr); + set_page_refcounted(page); + set_page_private(page, (unsigned long) mapping); - if (!IS_ERR(page)) { - set_page_refcounted(page); - set_page_private(page, (unsigned long) mapping); - } return page; } @@ -757,6 +814,13 @@ out: return ret; } +static void hugetlb_vm_op_close(struct vm_area_struct *vma) +{ + unsigned long reserve = vma_resv_huge_pages(vma); + if (reserve) + hugetlb_acct_memory(-reserve); +} + /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the @@ -771,6 +835,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, + .close = hugetlb_vm_op_close, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, @@ -1289,11 +1354,25 @@ static long region_truncate(struct list_head *head, long end) return chg; } -int hugetlb_reserve_pages(struct inode *inode, long from, long to) +int hugetlb_reserve_pages(struct inode *inode, + long from, long to, + struct vm_area_struct *vma) { long ret, chg; - chg = region_chg(&inode->i_mapping->private_list, from, to); + /* + * Shared mappings base their reservation on the number of pages that + * are already allocated on behalf of the file. Private mappings need + * to reserve the full area even if read-only as mprotect() may be + * called to make the mapping read-write. Assume !vma is a shm mapping + */ + if (!vma || vma->vm_flags & VM_SHARED) + chg = region_chg(&inode->i_mapping->private_list, from, to); + else { + chg = to - from; + set_vma_resv_huge_pages(vma, chg); + } + if (chg < 0) return chg; @@ -1304,7 +1383,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to) hugetlb_put_quota(inode->i_mapping, chg); return ret; } - region_add(&inode->i_mapping->private_list, from, to); + if (!vma || vma->vm_flags & VM_SHARED) + region_add(&inode->i_mapping->private_list, from, to); return 0; } -- cgit v1.2.2 From 04f2cbe35699d22dbf428373682ead85ca1240f5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:27:25 -0700 Subject: hugetlb: guarantee that COW faults for a process that called mmap(MAP_PRIVATE) on hugetlbfs will succeed After patch 2 in this series, a process that successfully calls mmap() for a MAP_PRIVATE mapping will be guaranteed to successfully fault until a process calls fork(). At that point, the next write fault from the parent could fail due to COW if the child still has a reference. 
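
Seen from userspace, the sequence in question is roughly the one below. Again a hedged sketch: the hugetlbfs path and page size are assumptions, and the comments describe the intended behaviour of this patch rather than guaranteed output.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define HUGE_FILE "/mnt/huge/cow-demo"		/* assumed hugetlbfs mount */
#define MAP_BYTES (2UL * 1024 * 1024)		/* one page, assuming 2MB pages */

int main(void)
{
	int fd = open(HUGE_FILE, O_CREAT | O_RDWR, 0600);
	char *map;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	map = mmap(NULL, MAP_BYTES, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	map[0] = 1;			/* parent instantiates its private copy */

	if (fork() == 0) {		/* child inherits a reference ... */
		sleep(1);		/* ... and keeps the mapping alive */
		_exit(0);
	}

	/*
	 * This write needs a COW copy because the child still holds the
	 * page. The intent of this patch is that the original mapper
	 * always succeeds here, if necessary by unmapping the page from
	 * the child and letting the child fail later instead.
	 */
	map[0] = 2;
	wait(NULL);

	munmap(map, MAP_BYTES);
	close(fd);
	unlink(HUGE_FILE);
	return 0;
}
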
We only reserve pages for the parent but a copy must be made to avoid leaking data from the parent to the child after fork(). Reserves could be taken for both parent and child at fork time to guarantee faults but if the mapping is large it is highly likely we will not have sufficient pages for the reservation, and it is common to fork only to exec() immediatly after. A failure here would be very undesirable. Note that the current behaviour of mainline with MAP_PRIVATE pages is pretty bad. The following situation is allowed to occur today. 1. Process calls mmap(MAP_PRIVATE) 2. Process calls mlock() to fault all pages and makes sure it succeeds 3. Process forks() 4. Process writes to MAP_PRIVATE mapping while child still exists 5. If the COW fails at this point, the process gets SIGKILLed even though it had taken care to ensure the pages existed This patch improves the situation by guaranteeing the reliability of the process that successfully calls mmap(). When the parent performs COW, it will try to satisfy the allocation without using reserves. If that fails the parent will steal the page leaving any children without a page. Faults from the child after that point will result in failure. If the child COW happens first, an attempt will be made to allocate the page without reserves and the child will get SIGKILLed on failure. To summarise the new behaviour: 1. If the original mapper performs COW on a private mapping with multiple references, it will attempt to allocate a hugepage from the pool or the buddy allocator without using the existing reserves. On fail, VMAs mapping the same area are traversed and the page being COW'd is unmapped where found. It will then steal the original page as the last mapper in the normal way. 2. The VMAs the pages were unmapped from are flagged to note that pages with data no longer exist. Future no-page faults on those VMAs will terminate the process as otherwise it would appear that data was corrupted. A warning is printed to the console that this situation occured. 2. If the child performs COW first, it will attempt to satisfy the COW from the pool if there are enough pages or via the buddy allocator if overcommit is allowed and the buddy allocator can satisfy the request. If it fails, the child will be killed. If the pool is large enough, existing applications will not notice that the reserves were a factor. Existing applications depending on the no-reserves been set are unlikely to exist as for much of the history of hugetlbfs, pages were prefaulted at mmap(), allocating the pages at that point or failing the mmap(). [npiggin@suse.de: fix CONFIG_HUGETLB=n build] Signed-off-by: Mel Gorman Acked-by: Adam Litke Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 201 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ mm/memory.c | 2 +- 2 files changed, 184 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0af500db3632..a2d29b84501f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -40,6 +40,9 @@ static int hugetlb_next_nid; */ static DEFINE_SPINLOCK(hugetlb_lock); +#define HPAGE_RESV_OWNER (1UL << (BITS_PER_LONG - 1)) +#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2)) +#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) /* * These helpers are used to track how many pages are reserved for * faults in a MAP_PRIVATE mapping. 
Only the process that called mmap() @@ -54,17 +57,32 @@ static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); if (!(vma->vm_flags & VM_SHARED)) - return (unsigned long)vma->vm_private_data; + return (unsigned long)vma->vm_private_data & ~HPAGE_RESV_MASK; return 0; } static void set_vma_resv_huge_pages(struct vm_area_struct *vma, unsigned long reserve) { + unsigned long flags; VM_BUG_ON(!is_vm_hugetlb_page(vma)); VM_BUG_ON(vma->vm_flags & VM_SHARED); - vma->vm_private_data = (void *)reserve; + flags = (unsigned long)vma->vm_private_data & HPAGE_RESV_MASK; + vma->vm_private_data = (void *)(reserve | flags); +} + +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) +{ + unsigned long reserveflags = (unsigned long)vma->vm_private_data; + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + vma->vm_private_data = (void *)(reserveflags | flags); +} + +static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + return ((unsigned long)vma->vm_private_data & flag) != 0; } /* Decrement the reserved pages in the hugepage pool by one */ @@ -78,14 +96,18 @@ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) * Only the process that called mmap() has reserves for * private mappings. */ - if (vma_resv_huge_pages(vma)) { + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + unsigned long flags, reserve; resv_huge_pages--; + flags = (unsigned long)vma->vm_private_data & + HPAGE_RESV_MASK; reserve = (unsigned long)vma->vm_private_data - 1; - vma->vm_private_data = (void *)reserve; + vma->vm_private_data = (void *)(reserve | flags); } } } +/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); @@ -153,7 +175,7 @@ static struct page *dequeue_huge_page(void) } static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, - unsigned long address) + unsigned long address, int avoid_reserve) { int nid; struct page *page = NULL; @@ -173,6 +195,10 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, free_huge_pages - resv_huge_pages == 0) return NULL; + /* If reserves cannot be used, ensure enough pages are in the pool */ + if (avoid_reserve && free_huge_pages - resv_huge_pages == 0) + return NULL; + for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { nid = zone_to_nid(zone); @@ -183,7 +209,9 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, list_del(&page->lru); free_huge_pages--; free_huge_pages_node[nid]--; - decrement_hugepage_resv_vma(vma); + + if (!avoid_reserve) + decrement_hugepage_resv_vma(vma); break; } @@ -534,7 +562,7 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) } static struct page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, int avoid_reserve) { struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; @@ -546,14 +574,15 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, * will not have accounted against quota. 
Check that the quota can be * made before satisfying the allocation */ - if (!vma_has_private_reserves(vma)) { + if (!(vma->vm_flags & VM_SHARED) && + !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { chg = 1; if (hugetlb_get_quota(inode->i_mapping, chg)) return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(vma, addr); + page = dequeue_huge_page_vma(vma, addr, avoid_reserve); spin_unlock(&hugetlb_lock); if (!page) { @@ -909,7 +938,7 @@ nomem: } void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) + unsigned long end, struct page *ref_page) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -937,6 +966,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, if (huge_pmd_unshare(mm, &address, ptep)) continue; + /* + * If a reference page is supplied, it is because a specific + * page is being unmapped, not a range. Ensure the page we + * are about to unmap is the actual page of interest. + */ + if (ref_page) { + pte = huge_ptep_get(ptep); + if (huge_pte_none(pte)) + continue; + page = pte_page(pte); + if (page != ref_page) + continue; + + /* + * Mark the VMA as having unmapped its page so that + * future faults in this VMA will fail rather than + * looking like data was lost + */ + set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); + } + pte = huge_ptep_get_and_clear(mm, address, ptep); if (huge_pte_none(pte)) continue; @@ -955,7 +1005,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) + unsigned long end, struct page *ref_page) { /* * It is undesirable to test vma->vm_file as it should be non-null @@ -967,19 +1017,68 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, */ if (vma->vm_file) { spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); - __unmap_hugepage_range(vma, start, end); + __unmap_hugepage_range(vma, start, end, ref_page); spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); } } +/* + * This is called when the original mapper is failing to COW a MAP_PRIVATE + * mappping it owns the reserve page for. The intention is to unmap the page + * from other VMAs and let the children be SIGKILLed if they are faulting the + * same region. + */ +int unmap_ref_private(struct mm_struct *mm, + struct vm_area_struct *vma, + struct page *page, + unsigned long address) +{ + struct vm_area_struct *iter_vma; + struct address_space *mapping; + struct prio_tree_iter iter; + pgoff_t pgoff; + + /* + * vm_pgoff is in PAGE_SIZE units, hence the different calculation + * from page cache lookup which is in HPAGE_SIZE units. + */ + address = address & huge_page_mask(hstate_vma(vma)); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + (vma->vm_pgoff >> PAGE_SHIFT); + mapping = (struct address_space *)page_private(page); + + vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + /* Do not unmap the current VMA */ + if (iter_vma == vma) + continue; + + /* + * Unmap the page from other VMAs without their own reserves. + * They get marked to be SIGKILLed if they fault in these + * areas. This is because a future no-page fault on this VMA + * could insert a zeroed page instead of the data existing + * from the time of fork. 
This would look like data corruption + */ + if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) + unmap_hugepage_range(iter_vma, + address, address + HPAGE_SIZE, + page); + } + + return 1; +} + static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, pte_t pte) + unsigned long address, pte_t *ptep, pte_t pte, + struct page *pagecache_page) { struct page *old_page, *new_page; int avoidcopy; + int outside_reserve = 0; old_page = pte_page(pte); +retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ avoidcopy = (page_count(old_page) == 1); @@ -988,11 +1087,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } + /* + * If the process that created a MAP_PRIVATE mapping is about to + * perform a COW due to a shared page count, attempt to satisfy + * the allocation without using the existing reserves. The pagecache + * page is used to determine if the reserve at this address was + * consumed or not. If reserves were used, a partial faulted mapping + * at the time of fork() could consume its reserves on COW instead + * of the full address range. + */ + if (!(vma->vm_flags & VM_SHARED) && + is_vma_resv_set(vma, HPAGE_RESV_OWNER) && + old_page != pagecache_page) + outside_reserve = 1; + page_cache_get(old_page); - new_page = alloc_huge_page(vma, address); + new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { page_cache_release(old_page); + + /* + * If a process owning a MAP_PRIVATE mapping fails to COW, + * it is due to references held by a child and an insufficient + * huge page pool. To guarantee the original mappers + * reliability, unmap the page from child processes. The child + * may get SIGKILLed if it later faults. + */ + if (outside_reserve) { + BUG_ON(huge_pte_none(pte)); + if (unmap_ref_private(mm, vma, old_page, address)) { + BUG_ON(page_count(old_page) != 1); + BUG_ON(huge_pte_none(pte)); + goto retry_avoidcopy; + } + WARN_ON_ONCE(1); + } + return -PTR_ERR(new_page); } @@ -1015,6 +1146,20 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } +/* Return the pagecache page at a given address within a VMA */ +static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, + unsigned long address) +{ + struct address_space *mapping; + unsigned long idx; + + mapping = vma->vm_file->f_mapping; + idx = ((address - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + + return find_lock_page(mapping, idx); +} + static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, int write_access) { @@ -1025,6 +1170,18 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping; pte_t new_pte; + /* + * Currently, we are forced to kill the process in the event the + * original mapper has unmapped pages from the child due to a failed + * COW. 
Warn that such a situation has occured as it may not be obvious + */ + if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { + printk(KERN_WARNING + "PID %d killed due to inadequate hugepage pool\n", + current->pid); + return ret; + } + mapping = vma->vm_file->f_mapping; idx = ((address - vma->vm_start) >> HPAGE_SHIFT) + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); @@ -1039,7 +1196,7 @@ retry: size = i_size_read(mapping->host) >> HPAGE_SHIFT; if (idx >= size) goto out; - page = alloc_huge_page(vma, address); + page = alloc_huge_page(vma, address, 0); if (IS_ERR(page)) { ret = -PTR_ERR(page); goto out; @@ -1081,7 +1238,7 @@ retry: if (write_access && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte); + ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); } spin_unlock(&mm->page_table_lock); @@ -1126,8 +1283,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (likely(pte_same(entry, huge_ptep_get(ptep)))) - if (write_access && !pte_write(entry)) - ret = hugetlb_cow(mm, vma, address, ptep, entry); + if (write_access && !pte_write(entry)) { + struct page *page; + page = hugetlbfs_pagecache_page(vma, address); + ret = hugetlb_cow(mm, vma, address, ptep, entry, page); + if (page) { + unlock_page(page); + put_page(page); + } + } spin_unlock(&mm->page_table_lock); mutex_unlock(&hugetlb_instantiation_mutex); @@ -1371,6 +1535,7 @@ int hugetlb_reserve_pages(struct inode *inode, else { chg = to - from; set_vma_resv_huge_pages(vma, chg); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } if (chg < 0) diff --git a/mm/memory.c b/mm/memory.c index 82f3f1c5cf17..72932489a082 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -901,7 +901,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, } if (unlikely(is_vm_hugetlb_page(vma))) { - unmap_hugepage_range(vma, start, end); + unmap_hugepage_range(vma, start, end, NULL); zap_work -= (end - start) / (HPAGE_SIZE / PAGE_SIZE); start = end; -- cgit v1.2.2 From e7c4b0bfd025f71cf7624b7c1be174f63caade33 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 23 Jul 2008 21:27:26 -0700 Subject: huge page private reservation review cleanups Create some new accessors for vma private data to cut down on and contain the casts. Encapsulates the huge and small page offset calculations. Also adds a couple of VM_BUG_ONs for consistency. [akpm@linux-foundation.org: Make things static] Signed-off-by: Andy Whitcroft Acked-by: Mel Gorman Cc: Adam Litke Cc: Johannes Weiner Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 58 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a2d29b84501f..3e873f0101fb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -40,6 +40,28 @@ static int hugetlb_next_nid; */ static DEFINE_SPINLOCK(hugetlb_lock); +/* + * Convert the address within this vma to the page offset within + * the mapping, in base page units. + */ +static pgoff_t vma_page_offset(struct vm_area_struct *vma, + unsigned long address) +{ + return ((address - vma->vm_start) >> PAGE_SHIFT) + + (vma->vm_pgoff >> PAGE_SHIFT); +} + +/* + * Convert the address within this vma to the page offset within + * the mapping, in pagecache page units; huge pages here. 
+ */ +static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma, + unsigned long address) +{ + return ((address - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); +} + #define HPAGE_RESV_OWNER (1UL << (BITS_PER_LONG - 1)) #define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2)) #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) @@ -53,36 +75,48 @@ static DEFINE_SPINLOCK(hugetlb_lock); * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result of the values. */ +static unsigned long get_vma_private_data(struct vm_area_struct *vma) +{ + return (unsigned long)vma->vm_private_data; +} + +static void set_vma_private_data(struct vm_area_struct *vma, + unsigned long value) +{ + vma->vm_private_data = (void *)value; +} + static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); if (!(vma->vm_flags & VM_SHARED)) - return (unsigned long)vma->vm_private_data & ~HPAGE_RESV_MASK; + return get_vma_private_data(vma) & ~HPAGE_RESV_MASK; return 0; } static void set_vma_resv_huge_pages(struct vm_area_struct *vma, unsigned long reserve) { - unsigned long flags; VM_BUG_ON(!is_vm_hugetlb_page(vma)); VM_BUG_ON(vma->vm_flags & VM_SHARED); - flags = (unsigned long)vma->vm_private_data & HPAGE_RESV_MASK; - vma->vm_private_data = (void *)(reserve | flags); + set_vma_private_data(vma, + (get_vma_private_data(vma) & HPAGE_RESV_MASK) | reserve); } static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { - unsigned long reserveflags = (unsigned long)vma->vm_private_data; VM_BUG_ON(!is_vm_hugetlb_page(vma)); - vma->vm_private_data = (void *)(reserveflags | flags); + VM_BUG_ON(vma->vm_flags & VM_SHARED); + + set_vma_private_data(vma, get_vma_private_data(vma) | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - return ((unsigned long)vma->vm_private_data & flag) != 0; + + return (get_vma_private_data(vma) & flag) != 0; } /* Decrement the reserved pages in the hugepage pool by one */ @@ -1151,11 +1185,10 @@ static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping; - unsigned long idx; + pgoff_t idx; mapping = vma->vm_file->f_mapping; - idx = ((address - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + idx = vma_pagecache_offset(vma, address); return find_lock_page(mapping, idx); } @@ -1164,7 +1197,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, int write_access) { int ret = VM_FAULT_SIGBUS; - unsigned long idx; + pgoff_t idx; unsigned long size; struct page *page; struct address_space *mapping; @@ -1183,8 +1216,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, } mapping = vma->vm_file->f_mapping; - idx = ((address - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + idx = vma_pagecache_offset(vma, address); /* * Use page lock to guard against racing truncation -- cgit v1.2.2 From cdfd4325c0d878679bd6a3ba8285b71d9980e3c0 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 23 Jul 2008 21:27:28 -0700 Subject: mm: record MAP_NORESERVE status on vmas and fix small page mprotect reservations With Mel's hugetlb private reservation support patches applied, strict overcommit semantics are applied to both shared and 
private huge page mappings. This can be a problem if an application relied on unlimited overcommit semantics for private mappings. An example of this would be an application which maps a huge area with the intention of using it very sparsely. These application would benefit from being able to opt-out of the strict overcommit. It should be noted that prior to hugetlb supporting demand faulting all mappings were fully populated and so applications of this type should be rare. This patch stack implements the MAP_NORESERVE mmap() flag for huge page mappings. This flag has the same meaning as for small page mappings, suppressing reservations for that mapping. Thanks to Mel Gorman for reviewing a number of early versions of these patches. This patch: When a small page mapping is created with mmap() reservations are created by default for any memory pages required. When the region is read/write the reservation is increased for every page, no reservation is needed for read-only regions (as they implicitly share the zero page). Reservations are tracked via the VM_ACCOUNT vma flag which is present when the region has reservation backing it. When we convert a region from read-only to read-write new reservations are aquired and VM_ACCOUNT is set. However, when a read-only map is created with MAP_NORESERVE it is indistinguishable from a normal mapping. When we then convert that to read/write we are forced to incorrectly create reservations for it as we have no record of the original MAP_NORESERVE. This patch introduces a new vma flag VM_NORESERVE which records the presence of the original MAP_NORESERVE flag. This allows us to distinguish these two circumstances and correctly account the reserve. As well as fixing this FIXME in the code, this makes it much easier to introduce MAP_NORESERVE support for huge pages as this flag is available consistantly for the life of the mapping. VM_ACCOUNT on the other hand is heavily used at the generic level in association with small pages. Signed-off-by: Andy Whitcroft Cc: Mel Gorman Cc: Adam Litke Cc: Johannes Weiner Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 3 +++ mm/mprotect.c | 6 ++---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 75e0d0673d78..57d3b6097deb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1110,6 +1110,9 @@ munmap_back: if (!may_expand_vm(mm, len >> PAGE_SHIFT)) return -ENOMEM; + if (flags & MAP_NORESERVE) + vm_flags |= VM_NORESERVE; + if (accountable && (!(flags & MAP_NORESERVE) || sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { if (vm_flags & VM_SHARED) { diff --git a/mm/mprotect.c b/mm/mprotect.c index 360d9cc8b38c..abd645a3b0a0 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -153,12 +153,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we * make it unwritable again. - * - * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting - * a MAP_NORESERVE private mapping to writable will now reserve. 
*/ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { + if (!(oldflags & (VM_ACCOUNT|VM_WRITE| + VM_SHARED|VM_NORESERVE))) { charged = nrpages; if (security_vm_enough_memory(charged)) return -ENOMEM; -- cgit v1.2.2 From 9682290484370ce68ba23cd2ec2838e301934199 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 23 Jul 2008 21:27:29 -0700 Subject: hugetlb: move reservation region support earlier The following patch will require use of the reservation regions support. Move this earlier in the file. No changes have been made to this code. Signed-off-by: Andy Whitcroft Cc: Mel Gorman Acked-by: Adam Litke Cc: Johannes Weiner Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 246 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 125 insertions(+), 121 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3e873f0101fb..05bc9af4fca9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -40,6 +40,131 @@ static int hugetlb_next_nid; */ static DEFINE_SPINLOCK(hugetlb_lock); +/* + * Region tracking -- allows tracking of reservations and instantiated pages + * across the pages in a mapping. + */ +struct file_region { + struct list_head link; + long from; + long to; +}; + +static long region_add(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg, *trg; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + + /* Check for and consume any regions we now overlap with. */ + nrg = rg; + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + break; + + /* If this area reaches higher then extend our area to + * include it completely. If this is not the first area + * which we intend to reuse, free it. */ + if (rg->to > t) + t = rg->to; + if (rg != nrg) { + list_del(&rg->link); + kfree(rg); + } + } + nrg->from = f; + nrg->to = t; + return 0; +} + +static long region_chg(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg; + long chg = 0; + + /* Locate the region we are before or in. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* If we are below the current region then a new region is required. + * Subtle, allocate a new region at the position but make it zero + * size such that we can guarantee to record the reservation. */ + if (&rg->link == head || t < rg->from) { + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (!nrg) + return -ENOMEM; + nrg->from = f; + nrg->to = f; + INIT_LIST_HEAD(&nrg->link); + list_add(&nrg->link, rg->link.prev); + + return t - f; + } + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + chg = t - f; + + /* Check for and consume any regions we now overlap with. */ + list_for_each_entry(rg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + return chg; + + /* We overlap with this area, if it extends futher than + * us then we must extend ourselves. Account for its + * existing reservation. 
*/ + if (rg->to > t) { + chg += rg->to - t; + t = rg->to; + } + chg -= rg->to - rg->from; + } + return chg; +} + +static long region_truncate(struct list_head *head, long end) +{ + struct file_region *rg, *trg; + long chg = 0; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (end <= rg->to) + break; + if (&rg->link == head) + return 0; + + /* If we are in the middle of a region then adjust it. */ + if (end > rg->from) { + chg = rg->to - end; + rg->to = end; + rg = list_entry(rg->link.next, typeof(*rg), link); + } + + /* Drop any remaining regions. */ + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + chg += rg->to - rg->from; + list_del(&rg->link); + kfree(rg); + } + return chg; +} + /* * Convert the address within this vma to the page offset within * the mapping, in base page units. @@ -1429,127 +1554,6 @@ void hugetlb_change_protection(struct vm_area_struct *vma, flush_tlb_range(vma, start, end); } -struct file_region { - struct list_head link; - long from; - long to; -}; - -static long region_add(struct list_head *head, long f, long t) -{ - struct file_region *rg, *nrg, *trg; - - /* Locate the region we are either in or before. */ - list_for_each_entry(rg, head, link) - if (f <= rg->to) - break; - - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; - - /* Check for and consume any regions we now overlap with. */ - nrg = rg; - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { - if (&rg->link == head) - break; - if (rg->from > t) - break; - - /* If this area reaches higher then extend our area to - * include it completely. If this is not the first area - * which we intend to reuse, free it. */ - if (rg->to > t) - t = rg->to; - if (rg != nrg) { - list_del(&rg->link); - kfree(rg); - } - } - nrg->from = f; - nrg->to = t; - return 0; -} - -static long region_chg(struct list_head *head, long f, long t) -{ - struct file_region *rg, *nrg; - long chg = 0; - - /* Locate the region we are before or in. */ - list_for_each_entry(rg, head, link) - if (f <= rg->to) - break; - - /* If we are below the current region then a new region is required. - * Subtle, allocate a new region at the position but make it zero - * size such that we can guarantee to record the reservation. */ - if (&rg->link == head || t < rg->from) { - nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); - if (!nrg) - return -ENOMEM; - nrg->from = f; - nrg->to = f; - INIT_LIST_HEAD(&nrg->link); - list_add(&nrg->link, rg->link.prev); - - return t - f; - } - - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; - chg = t - f; - - /* Check for and consume any regions we now overlap with. */ - list_for_each_entry(rg, rg->link.prev, link) { - if (&rg->link == head) - break; - if (rg->from > t) - return chg; - - /* We overlap with this area, if it extends futher than - * us then we must extend ourselves. Account for its - * existing reservation. */ - if (rg->to > t) { - chg += rg->to - t; - t = rg->to; - } - chg -= rg->to - rg->from; - } - return chg; -} - -static long region_truncate(struct list_head *head, long end) -{ - struct file_region *rg, *trg; - long chg = 0; - - /* Locate the region we are either in or before. */ - list_for_each_entry(rg, head, link) - if (end <= rg->to) - break; - if (&rg->link == head) - return 0; - - /* If we are in the middle of a region then adjust it. 
*/ - if (end > rg->from) { - chg = rg->to - end; - rg->to = end; - rg = list_entry(rg->link.next, typeof(*rg), link); - } - - /* Drop any remaining regions. */ - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { - if (&rg->link == head) - break; - chg += rg->to - rg->from; - list_del(&rg->link); - kfree(rg); - } - return chg; -} - int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma) -- cgit v1.2.2 From c37f9fb11c976ffc08200d631dada6dcbfd07ea4 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 23 Jul 2008 21:27:30 -0700 Subject: hugetlb: allow huge page mappings to be created without reservations By default all shared mappings and most private mappings now have reservations associated with them. This improves semantics by providing allocation guarentees to the mapper. However a small number of applications may attempt to make very large sparse mappings, with these strict reservations the system will never be able to honour the mapping. This patch set brings MAP_NORESERVE support to hugetlb files. This allows new mappings to be made to hugetlbfs files without an associated reservation, for both shared and private mappings. This allows applications which want to create very sparse mappings to opt-out of the reservation system. Obviously as there is no reservation they are liable to fault at runtime if the huge page pool becomes exhausted; buyer beware. Signed-off-by: Andy Whitcroft Cc: Mel Gorman Cc: Adam Litke Cc: Johannes Weiner Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 05bc9af4fca9..72acbb29d2cc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -247,6 +247,9 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) /* Decrement the reserved pages in the hugepage pool by one */ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) { + if (vma->vm_flags & VM_NORESERVE) + return; + if (vma->vm_flags & VM_SHARED) { /* Shared mappings always use reserves */ resv_huge_pages--; @@ -720,25 +723,65 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) } } +/* + * Determine if the huge page at addr within the vma has an associated + * reservation. Where it does not we will need to logically increase + * reservation and actually increase quota before an allocation can occur. + * Where any new reservation would be required the reservation change is + * prepared, but not committed. Once the page has been quota'd allocated + * an instantiated the change should be committed via vma_commit_reservation. + * No action is required on failure. 
+ */ +static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + if (vma->vm_flags & VM_SHARED) { + pgoff_t idx = vma_pagecache_offset(vma, addr); + return region_chg(&inode->i_mapping->private_list, + idx, idx + 1); + + } else { + if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return 1; + } + + return 0; +} +static void vma_commit_reservation(struct vm_area_struct *vma, + unsigned long addr) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + if (vma->vm_flags & VM_SHARED) { + pgoff_t idx = vma_pagecache_offset(vma, addr); + region_add(&inode->i_mapping->private_list, idx, idx + 1); + } +} + static struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned int chg = 0; + unsigned int chg; /* * Processes that did not create the mapping will have no reserves and * will not have accounted against quota. Check that the quota can be * made before satisfying the allocation + * MAP_NORESERVE mappings may also need pages and quota allocated + * if no reserve mapping overlaps. */ - if (!(vma->vm_flags & VM_SHARED) && - !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - chg = 1; + chg = vma_needs_reservation(vma, addr); + if (chg < 0) + return ERR_PTR(chg); + if (chg) if (hugetlb_get_quota(inode->i_mapping, chg)) return ERR_PTR(-ENOSPC); - } spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(vma, addr, avoid_reserve); @@ -755,6 +798,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_refcounted(page); set_page_private(page, (unsigned long) mapping); + vma_commit_reservation(vma, addr); + return page; } @@ -1560,6 +1605,9 @@ int hugetlb_reserve_pages(struct inode *inode, { long ret, chg; + if (vma && vma->vm_flags & VM_NORESERVE) + return 0; + /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need -- cgit v1.2.2 From 84afd99b8398c9d73af8238aa3cd835858e3097a Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 23 Jul 2008 21:27:32 -0700 Subject: hugetlb reservations: fix hugetlb MAP_PRIVATE reservations across vma splits When a hugetlb mapping with a reservation is split, a new VMA is cloned from the original. This new VMA is a direct copy of the original including the reservation count. When this pair of VMAs are unmapped we will incorrect double account the unused reservation and the overall reservation count will be incorrect, in extreme cases it will wrap. The problem occurs when we split an existing VMA say to unmap a page in the middle. split_vma() will create a new VMA copying all fields from the original. As we are storing our reservation count in vm_private_data this is also copies, endowing the new VMA with a duplicate of the original VMA's reservation. Neither of the new VMAs can exhaust these reservations as they are too small, but when we unmap and close these VMAs we will incorrect credit the remainder twice and resv_huge_pages will become out of sync. This can lead to allocation failures on mappings with reservations and even to resv_huge_pages wrapping which prevents all subsequent hugepage allocations. The simple fix would be to correctly apportion the remaining reservation count when the split is made. 
However the only hook we have vm_ops->open only has the new VMA we do not know the identity of the preceeding VMA. Also even if we did have that VMA to hand we do not know how much of the reservation was consumed each side of the split. This patch therefore takes a different tack. We know that the whole of any private mapping (which has a reservation) has a reservation over its whole size. Any present pages represent consumed reservation. Therefore if we track the instantiated pages we can calculate the remaining reservation. This patch reuses the existing regions code to track the regions for which we have consumed reservation (ie. the instantiated pages), as each page is faulted in we record the consumption of reservation for the new page. When we need to return unused reservations at unmap time we simply count the consumed reservation region subtracting that from the whole of the map. During a VMA split the newly opened VMA will point to the same region map, as this map is offset oriented it remains valid for both of the split VMAs. This map is referenced counted so that it is removed when all VMAs which are part of the mmap are gone. Thanks to Adam Litke and Mel Gorman for their review feedback. Signed-off-by: Andy Whitcroft Acked-by: Mel Gorman Cc: Adam Litke Cc: Johannes Weiner Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Cc: Michael Kerrisk Cc: Jon Tollefson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 172 +++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 145 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 72acbb29d2cc..65616941a383 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -43,6 +43,16 @@ static DEFINE_SPINLOCK(hugetlb_lock); /* * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. + * + * The region data structures are protected by a combination of the mmap_sem + * and the hugetlb_instantion_mutex. To access or modify a region the caller + * must either hold the mmap_sem for write, or the mmap_sem for read and + * the hugetlb_instantiation mutex: + * + * down_write(&mm->mmap_sem); + * or + * down_read(&mm->mmap_sem); + * mutex_lock(&hugetlb_instantiation_mutex); */ struct file_region { struct list_head link; @@ -165,6 +175,30 @@ static long region_truncate(struct list_head *head, long end) return chg; } +static long region_count(struct list_head *head, long f, long t) +{ + struct file_region *rg; + long chg = 0; + + /* Locate each segment we overlap with, and count that overlap. */ + list_for_each_entry(rg, head, link) { + int seg_from; + int seg_to; + + if (rg->to <= f) + continue; + if (rg->from >= t) + break; + + seg_from = max(rg->from, f); + seg_to = min(rg->to, t); + + chg += seg_to - seg_from; + } + + return chg; +} + /* * Convert the address within this vma to the page offset within * the mapping, in base page units. @@ -187,9 +221,15 @@ static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma, (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); } -#define HPAGE_RESV_OWNER (1UL << (BITS_PER_LONG - 1)) -#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2)) +/* + * Flags for MAP_PRIVATE reservations. These are stored in the bottom + * bits of the reservation map pointer, which are always clear due to + * alignment. 
+ */ +#define HPAGE_RESV_OWNER (1UL << 0) +#define HPAGE_RESV_UNMAPPED (1UL << 1) #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) + /* * These helpers are used to track how many pages are reserved for * faults in a MAP_PRIVATE mapping. Only the process that called mmap() @@ -199,6 +239,15 @@ static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma, * the reserve counters are updated with the hugetlb_lock held. It is safe * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result of the values. + * + * The private mapping reservation is represented in a subtly different + * manner to a shared mapping. A shared mapping has a region map associated + * with the underlying file, this region map represents the backing file + * pages which have ever had a reservation assigned which this persists even + * after the page is instantiated. A private mapping has a region map + * associated with the original mmap which is attached to all VMAs which + * reference it, this region map represents those offsets which have consumed + * reservation ie. where pages have been instantiated. */ static unsigned long get_vma_private_data(struct vm_area_struct *vma) { @@ -211,22 +260,48 @@ static void set_vma_private_data(struct vm_area_struct *vma, vma->vm_private_data = (void *)value; } -static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma) +struct resv_map { + struct kref refs; + struct list_head regions; +}; + +struct resv_map *resv_map_alloc(void) +{ + struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); + if (!resv_map) + return NULL; + + kref_init(&resv_map->refs); + INIT_LIST_HEAD(&resv_map->regions); + + return resv_map; +} + +void resv_map_release(struct kref *ref) +{ + struct resv_map *resv_map = container_of(ref, struct resv_map, refs); + + /* Clear out any active regions before we release the map. */ + region_truncate(&resv_map->regions, 0); + kfree(resv_map); +} + +static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); if (!(vma->vm_flags & VM_SHARED)) - return get_vma_private_data(vma) & ~HPAGE_RESV_MASK; + return (struct resv_map *)(get_vma_private_data(vma) & + ~HPAGE_RESV_MASK); return 0; } -static void set_vma_resv_huge_pages(struct vm_area_struct *vma, - unsigned long reserve) +static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); VM_BUG_ON(vma->vm_flags & VM_SHARED); - set_vma_private_data(vma, - (get_vma_private_data(vma) & HPAGE_RESV_MASK) | reserve); + set_vma_private_data(vma, (get_vma_private_data(vma) & + HPAGE_RESV_MASK) | (unsigned long)map); } static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) @@ -253,19 +328,12 @@ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) if (vma->vm_flags & VM_SHARED) { /* Shared mappings always use reserves */ resv_huge_pages--; - } else { + } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { /* * Only the process that called mmap() has reserves for * private mappings. 
*/ - if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - unsigned long flags, reserve; - resv_huge_pages--; - flags = (unsigned long)vma->vm_private_data & - HPAGE_RESV_MASK; - reserve = (unsigned long)vma->vm_private_data - 1; - vma->vm_private_data = (void *)(reserve | flags); - } + resv_huge_pages--; } } @@ -282,7 +350,7 @@ static int vma_has_private_reserves(struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHARED) return 0; - if (!vma_resv_huge_pages(vma)) + if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return 0; return 1; } @@ -742,12 +810,19 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) return region_chg(&inode->i_mapping->private_list, idx, idx + 1); - } else { - if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) - return 1; - } + } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + return 1; - return 0; + } else { + int err; + pgoff_t idx = vma_pagecache_offset(vma, addr); + struct resv_map *reservations = vma_resv_map(vma); + + err = region_chg(&reservations->regions, idx, idx + 1); + if (err < 0) + return err; + return 0; + } } static void vma_commit_reservation(struct vm_area_struct *vma, unsigned long addr) @@ -758,6 +833,13 @@ static void vma_commit_reservation(struct vm_area_struct *vma, if (vma->vm_flags & VM_SHARED) { pgoff_t idx = vma_pagecache_offset(vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); + + } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + pgoff_t idx = vma_pagecache_offset(vma, addr); + struct resv_map *reservations = vma_resv_map(vma); + + /* Mark this page used in the map. */ + region_add(&reservations->regions, idx, idx + 1); } } @@ -1047,11 +1129,41 @@ out: return ret; } +static void hugetlb_vm_op_open(struct vm_area_struct *vma) +{ + struct resv_map *reservations = vma_resv_map(vma); + + /* + * This new VMA should share its siblings reservation map if present. + * The VMA will only ever have a valid reservation map pointer where + * it is being copied for another still existing VMA. As that VMA + * has a reference to the reservation map it cannot dissappear until + * after this open call completes. It is therefore safe to take a + * new reference here without additional locking. 
+ */ + if (reservations) + kref_get(&reservations->refs); +} + static void hugetlb_vm_op_close(struct vm_area_struct *vma) { - unsigned long reserve = vma_resv_huge_pages(vma); - if (reserve) - hugetlb_acct_memory(-reserve); + struct resv_map *reservations = vma_resv_map(vma); + unsigned long reserve; + unsigned long start; + unsigned long end; + + if (reservations) { + start = vma_pagecache_offset(vma, vma->vm_start); + end = vma_pagecache_offset(vma, vma->vm_end); + + reserve = (end - start) - + region_count(&reservations->regions, start, end); + + kref_put(&reservations->refs, resv_map_release); + + if (reserve) + hugetlb_acct_memory(-reserve); + } } /* @@ -1068,6 +1180,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, + .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, }; @@ -1617,8 +1730,13 @@ int hugetlb_reserve_pages(struct inode *inode, if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); else { + struct resv_map *resv_map = resv_map_alloc(); + if (!resv_map) + return -ENOMEM; + chg = to - from; - set_vma_resv_huge_pages(vma, chg); + + set_vma_resv_map(vma, resv_map); set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } -- cgit v1.2.2 From a858f7b2e9bb4eb665176dde5cf32eeaaf90f153 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:27:33 -0700 Subject: vma_page_offset() has no callees: drop it Hugh adds: vma_pagecache_offset() has a dangerously misleading name, since it's using hugepage units: rename it to vma_hugecache_offset(). [apw@shadowen.org: restack onto fixed MAP_PRIVATE reservations] [akpm@linux-foundation.org: vma_split conversion] Signed-off-by: Johannes Weiner Signed-off-by: Hugh Dickins Cc: Adam Litke Cc: Nishanth Aravamudan Cc: Andi Kleen Cc: Nick Piggin Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65616941a383..eda9642254a0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -199,22 +199,11 @@ static long region_count(struct list_head *head, long f, long t) return chg; } -/* - * Convert the address within this vma to the page offset within - * the mapping, in base page units. - */ -static pgoff_t vma_page_offset(struct vm_area_struct *vma, - unsigned long address) -{ - return ((address - vma->vm_start) >> PAGE_SHIFT) + - (vma->vm_pgoff >> PAGE_SHIFT); -} - /* * Convert the address within this vma to the page offset within * the mapping, in pagecache page units; huge pages here. 
*/ -static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma, +static pgoff_t vma_hugecache_offset(struct vm_area_struct *vma, unsigned long address) { return ((address - vma->vm_start) >> HPAGE_SHIFT) + @@ -806,7 +795,7 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) struct inode *inode = mapping->host; if (vma->vm_flags & VM_SHARED) { - pgoff_t idx = vma_pagecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(vma, addr); return region_chg(&inode->i_mapping->private_list, idx, idx + 1); @@ -815,7 +804,7 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) } else { int err; - pgoff_t idx = vma_pagecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(vma, addr); struct resv_map *reservations = vma_resv_map(vma); err = region_chg(&reservations->regions, idx, idx + 1); @@ -831,11 +820,11 @@ static void vma_commit_reservation(struct vm_area_struct *vma, struct inode *inode = mapping->host; if (vma->vm_flags & VM_SHARED) { - pgoff_t idx = vma_pagecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - pgoff_t idx = vma_pagecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(vma, addr); struct resv_map *reservations = vma_resv_map(vma); /* Mark this page used in the map. */ @@ -1153,8 +1142,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) unsigned long end; if (reservations) { - start = vma_pagecache_offset(vma, vma->vm_start); - end = vma_pagecache_offset(vma, vma->vm_end); + start = vma_hugecache_offset(vma, vma->vm_start); + end = vma_hugecache_offset(vma, vma->vm_end); reserve = (end - start) - region_count(&reservations->regions, start, end); @@ -1471,7 +1460,7 @@ static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, pgoff_t idx; mapping = vma->vm_file->f_mapping; - idx = vma_pagecache_offset(vma, address); + idx = vma_hugecache_offset(vma, address); return find_lock_page(mapping, idx); } @@ -1499,7 +1488,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, } mapping = vma->vm_file->f_mapping; - idx = vma_pagecache_offset(vma, address); + idx = vma_hugecache_offset(vma, address); /* * Use page lock to guard against racing truncation -- cgit v1.2.2 From 11fa977ecde652ab324dd79c179deb52e82a8df1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 23 Jul 2008 21:27:34 -0700 Subject: generic_file_aio_read() cleanups As akpm points out, there's really no need for generic_file_aio_read to make a special case of count 0: just loop through nr_segs doing nothing. And as Harvey Harrison points out, there's no need to reset retval to 0 where it's already 0. Setting count (or ocount) to 0 before calling generic_segment_checks is unnecessary too; but reluctantly I'll leave that removal to someone with a wider range of gcc versions to hand - 4.1.2 and 4.2.1 don't warn about it, but perhaps others do - I forget which are the warniest versions. 
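To make the reasoning concrete, here is a distilled model of the per-segment loop as plain standalone C rather than the kernel function itself; read_one() is an invented stand-in for the actual per-segment read. When the total count is zero every iovec has iov_len == 0, so every iteration takes the continue branch and the function simply returns 0, with no special case needed.

#include <sys/types.h>
#include <sys/uio.h>

/* Sketch only: iterate the iovecs, skipping empty segments, accumulating
 * bytes read, stopping on error or a short read. */
static ssize_t read_iovecs(const struct iovec *iov, unsigned long nr_segs,
			   ssize_t (*read_one)(void *buf, size_t len))
{
	ssize_t retval = 0;
	unsigned long seg;

	for (seg = 0; seg < nr_segs; seg++) {
		ssize_t done;

		if (iov[seg].iov_len == 0)
			continue;	/* zero-length segment: nothing to do */

		done = read_one(iov[seg].iov_base, iov[seg].iov_len);
		if (done < 0)
			return retval ? retval : done;	/* report the error only if nothing was read */

		retval += done;
		if ((size_t)done < iov[seg].iov_len)
			break;		/* short read: stop early */
	}
	return retval;	/* stays 0 when every segment was empty */
}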
Signed-off-by: Hugh Dickins Tested-by: Lawrence Greenfield Cc: Christoph Rohland Cc: Badari Pulavarty Cc: Zach Brown Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 6343f3c841b7..7675b91f4f63 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1197,7 +1197,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, mapping = filp->f_mapping; inode = mapping->host; - retval = 0; if (!count) goto out; /* skip atime */ size = i_size_read(inode); @@ -1209,33 +1208,30 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, } if (retval > 0) *ppos = pos + retval; - } - if (likely(retval != 0)) { - file_accessed(filp); - goto out; + if (retval) { + file_accessed(filp); + goto out; + } } } - retval = 0; - if (count) { - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; - desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; - if (desc.count == 0) - continue; - desc.error = 0; - do_generic_file_read(filp,ppos,&desc,file_read_actor); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; + desc.written = 0; + desc.arg.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_generic_file_read(filp, ppos, &desc, file_read_actor); + retval += desc.written; + if (desc.error) { + retval = retval ?: desc.error; + break; } + if (desc.count > 0) + break; } out: return retval; -- cgit v1.2.2 From bcd78e49613c41b5bed96fa288e983876f286a59 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 23 Jul 2008 21:27:35 -0700 Subject: tmpfs: support aio We have a request for tmpfs to support the AIO interface: easily done, no more than replacing the old shmem_file_read by shmem_file_aio_read, cribbed from generic_file_aio_read. (In 2.6.25 its write side was already changed to use generic_file_aio_write.) Incorporate cleanups from Andrew Morton and Harvey Harrison. Tests out fine with LTP's ltp-aiodio.sh, given hacks (not included) to support O_DIRECT. tmpfs cannot honestly support O_DIRECT: its cache-avoiding-IO nature is at odds with direct IO-avoiding-cache. 
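As a usage sketch only (the tmpfs mount point /dev/shm and the file name below are illustrative assumptions, not part of the patch): after this change a vectored read on a tmpfs file is serviced by the new shmem_file_aio_read(), and a plain read() reaches it too via do_sync_read().

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	const char msg[] = "hello from tmpfs via the aio_read path\n";
	char a[16], b[64];
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};
	int fd = open("/dev/shm/aio-read-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0)
		return 1;
	if (write(fd, msg, strlen(msg)) < 0)
		return 1;
	lseek(fd, 0, SEEK_SET);

	/* readv() scatters the file contents across both buffers. */
	ssize_t n = readv(fd, iov, 2);
	printf("readv returned %zd bytes\n", n);

	close(fd);
	return 0;
}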
Signed-off-by: Hugh Dickins Tested-by: Lawrence Greenfield Cc: Christoph Rohland Cc: Badari Pulavarty Cc: Zach Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 55 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index e2a6ae1a44e9..9ffbea9b79e1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1690,26 +1690,38 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ file_accessed(filp); } -static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) -{ - read_descriptor_t desc; - - if ((ssize_t) count < 0) - return -EINVAL; - if (!access_ok(VERIFY_WRITE, buf, count)) - return -EFAULT; - if (!count) - return 0; - - desc.written = 0; - desc.count = count; - desc.arg.buf = buf; - desc.error = 0; - - do_shmem_file_read(filp, ppos, &desc, file_read_actor); - if (desc.written) - return desc.written; - return desc.error; +static ssize_t shmem_file_aio_read(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, loff_t pos) +{ + struct file *filp = iocb->ki_filp; + ssize_t retval; + unsigned long seg; + size_t count; + loff_t *ppos = &iocb->ki_pos; + + retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (retval) + return retval; + + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; + + desc.written = 0; + desc.arg.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_shmem_file_read(filp, ppos, &desc, file_read_actor); + retval += desc.written; + if (desc.error) { + retval = retval ?: desc.error; + break; + } + if (desc.count > 0) + break; + } + return retval; } static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -2369,8 +2381,9 @@ static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS .llseek = generic_file_llseek, - .read = shmem_file_read, + .read = do_sync_read, .write = do_sync_write, + .aio_read = shmem_file_aio_read, .aio_write = generic_file_aio_write, .fsync = simple_sync_file, .splice_read = generic_file_splice_read, -- cgit v1.2.2 From a47a126ad5ea072aca3e611ed8f8dc6adad24bab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 23 Jul 2008 21:27:38 -0700 Subject: vmallocinfo: add NUMA information Christoph recently added the /proc/vmallocinfo file to get information about vmalloc allocations. This patch adds NUMA-specific information, giving the number of pages allocated on each memory node. This should help to check that vmalloc() is able to respect NUMA policies. Example of output on a four-node machine (one cpu per node): 1) network hash tables are evenly spread across the four nodes (OK) (Same point for inodes and dentries hash tables) 2) iptables tables (x_tables) are correctly allocated on each cpu node (OK). 3) sys_swapon() allocates its memory from one node only. 4) each loaded module is using memory on one node. Sysadmins could tune their setup to change points 3) and 4) if necessary.
grep "pages=" /proc/vmallocinfo 0xffffc20000000000-0xffffc20000201000 2101248 alloc_large_system_hash+0x204/0x2c0 pages=512 vmalloc N0=128 N1=128 N2=128 N3=128 0xffffc20000201000-0xffffc20000302000 1052672 alloc_large_system_hash+0x204/0x2c0 pages=256 vmalloc N0=64 N1=64 N2=64 N3=64 0xffffc2000031a000-0xffffc2000031d000 12288 alloc_large_system_hash+0x204/0x2c0 pages=2 vmalloc N1=1 N2=1 0xffffc2000031f000-0xffffc2000032b000 49152 cramfs_uncompress_init+0x2e/0x80 pages=11 vmalloc N0=3 N1=3 N2=2 N3=3 0xffffc2000033e000-0xffffc20000341000 12288 sys_swapon+0x640/0xac0 pages=2 vmalloc N0=2 0xffffc20000341000-0xffffc20000344000 12288 xt_alloc_table_info+0xfe/0x130 [x_tables] pages=2 vmalloc N0=2 0xffffc20000344000-0xffffc20000347000 12288 xt_alloc_table_info+0xfe/0x130 [x_tables] pages=2 vmalloc N1=2 0xffffc20000347000-0xffffc2000034a000 12288 xt_alloc_table_info+0xfe/0x130 [x_tables] pages=2 vmalloc N2=2 0xffffc2000034a000-0xffffc2000034d000 12288 xt_alloc_table_info+0xfe/0x130 [x_tables] pages=2 vmalloc N3=2 0xffffc20004381000-0xffffc20004402000 528384 alloc_large_system_hash+0x204/0x2c0 pages=128 vmalloc N0=32 N1=32 N2=32 N3=32 0xffffc20004402000-0xffffc20004803000 4198400 alloc_large_system_hash+0x204/0x2c0 pages=1024 vmalloc vpages N0=256 N1=256 N2=256 N3=256 0xffffc20004803000-0xffffc20004904000 1052672 alloc_large_system_hash+0x204/0x2c0 pages=256 vmalloc N0=64 N1=64 N2=64 N3=64 0xffffc20004904000-0xffffc20004bec000 3047424 sys_swapon+0x640/0xac0 pages=743 vmalloc vpages N0=743 0xffffffffa0000000-0xffffffffa000f000 61440 sys_init_module+0xc27/0x1d00 pages=14 vmalloc N1=14 0xffffffffa000f000-0xffffffffa0014000 20480 sys_init_module+0xc27/0x1d00 pages=4 vmalloc N0=4 0xffffffffa0014000-0xffffffffa0017000 12288 sys_init_module+0xc27/0x1d00 pages=2 vmalloc N0=2 0xffffffffa0017000-0xffffffffa0022000 45056 sys_init_module+0xc27/0x1d00 pages=10 vmalloc N1=10 0xffffffffa0022000-0xffffffffa0028000 24576 sys_init_module+0xc27/0x1d00 pages=5 vmalloc N3=5 0xffffffffa0028000-0xffffffffa0050000 163840 sys_init_module+0xc27/0x1d00 pages=39 vmalloc N1=39 0xffffffffa0050000-0xffffffffa0052000 8192 sys_init_module+0xc27/0x1d00 pages=1 vmalloc N1=1 0xffffffffa0052000-0xffffffffa0056000 16384 sys_init_module+0xc27/0x1d00 pages=3 vmalloc N1=3 0xffffffffa0056000-0xffffffffa0081000 176128 sys_init_module+0xc27/0x1d00 pages=42 vmalloc N3=42 0xffffffffa0081000-0xffffffffa00ae000 184320 sys_init_module+0xc27/0x1d00 pages=44 vmalloc N3=44 0xffffffffa00ae000-0xffffffffa00b1000 12288 sys_init_module+0xc27/0x1d00 pages=2 vmalloc N3=2 0xffffffffa00b1000-0xffffffffa00b9000 32768 sys_init_module+0xc27/0x1d00 pages=7 vmalloc N0=7 0xffffffffa00b9000-0xffffffffa00c4000 45056 sys_init_module+0xc27/0x1d00 pages=10 vmalloc N3=10 0xffffffffa00c6000-0xffffffffa00e0000 106496 sys_init_module+0xc27/0x1d00 pages=25 vmalloc N2=25 0xffffffffa00e0000-0xffffffffa00f1000 69632 sys_init_module+0xc27/0x1d00 pages=16 vmalloc N2=16 0xffffffffa00f1000-0xffffffffa00f4000 12288 sys_init_module+0xc27/0x1d00 pages=2 vmalloc N3=2 0xffffffffa00f4000-0xffffffffa00f7000 12288 sys_init_module+0xc27/0x1d00 pages=2 vmalloc N3=2 [akpm@linux-foundation.org: fix comment] Signed-off-by: Eric Dumazet Cc: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6e45b0f3d125..35f293816294 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -931,6 +931,25 @@ static void s_stop(struct 
seq_file *m, void *p) read_unlock(&vmlist_lock); } +static void show_numa_info(struct seq_file *m, struct vm_struct *v) +{ + if (NUMA_BUILD) { + unsigned int nr, *counters = m->private; + + if (!counters) + return; + + memset(counters, 0, nr_node_ids * sizeof(unsigned int)); + + for (nr = 0; nr < v->nr_pages; nr++) + counters[page_to_nid(v->pages[nr])]++; + + for_each_node_state(nr, N_HIGH_MEMORY) + if (counters[nr]) + seq_printf(m, " N%u=%u", nr, counters[nr]); + } +} + static int s_show(struct seq_file *m, void *p) { struct vm_struct *v = p; @@ -967,6 +986,7 @@ static int s_show(struct seq_file *m, void *p) if (v->flags & VM_VPAGES) seq_printf(m, " vpages"); + show_numa_info(m, v); seq_putc(m, '\n'); return 0; } -- cgit v1.2.2 From 5e9426abe209cf134adbbd62c5e73ef534eb73e9 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Wed, 23 Jul 2008 21:27:39 -0700 Subject: mm: remove mm_init compilation dependency on CONFIG_DEBUG_MEMORY_INIT Towards the end of putting all core mm initialization in mm_init.c, I plan on putting the creation of a mm kobject in a function in that file. However, the file is currently only compiled if CONFIG_DEBUG_MEMORY_INIT is set. Remove this dependency, but put the code under an #ifdef on the same config option. This should result in no functional changes. Signed-off-by: Nishanth Aravamudan Cc: Nick Piggin Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 3 +-- mm/mm_init.c | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 4bbc8f094ff0..06ca2381fef1 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o $(mmu-y) + page_isolation.o mm_init.o $(mmu-y) obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o @@ -26,7 +26,6 @@ obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_SLAB) += slab.o -obj-$(CONFIG_DEBUG_MEMORY_INIT) += mm_init.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o diff --git a/mm/mm_init.c b/mm/mm_init.c index ce445ca097e7..eaf0d3b47099 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -9,6 +9,7 @@ #include #include "internal.h" +#ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; /* The zonelists are simply reported, validation is manual. */ @@ -132,3 +133,4 @@ static __init int set_mminit_loglevel(char *str) return 0; } early_param("mminit_loglevel", set_mminit_loglevel); +#endif /* CONFIG_DEBUG_MEMORY_INIT */ -- cgit v1.2.2 From ff7ea79cf7c3a481851bd4b2185fdeb6ce4afa29 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Wed, 23 Jul 2008 21:27:39 -0700 Subject: mm: create /sys/kernel/mm Add a kobject to create /sys/kernel/mm when sysfs is mounted. The kobject will exist regardless. This will allow for the hugepage related sysfs directories to exist under the mm "subsystem" directory. Add an ABI file appropriately. 
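As a rough illustration of what the new mm_kobj makes possible, a follow-up could hang its own directory and attribute under /sys/kernel/mm along these lines; the "demo" names are hypothetical and not part of this patch.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/errno.h>

extern struct kobject *mm_kobj;	/* created by mm_sysfs_init() */

static ssize_t demo_show(struct kobject *kobj, struct kobj_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "42\n");
}
static struct kobj_attribute demo_attr = __ATTR(demo, 0444, demo_show, NULL);

static struct kobject *demo_kobj;

static int __init demo_sysfs_init(void)
{
	/* Shows up as /sys/kernel/mm/demo with a read-only "demo" file. */
	demo_kobj = kobject_create_and_add("demo", mm_kobj);
	if (!demo_kobj)
		return -ENOMEM;
	return sysfs_create_file(demo_kobj, &demo_attr.attr);
}
__initcall(demo_sysfs_init);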
[kosaki.motohiro@jp.fujitsu.com: fix build] Signed-off-by: Nishanth Aravamudan Cc: Nick Piggin Cc: Mel Gorman Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mm_init.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'mm') diff --git a/mm/mm_init.c b/mm/mm_init.c index eaf0d3b47099..c6af41ea9994 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -7,6 +7,8 @@ */ #include #include +#include +#include #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT @@ -134,3 +136,17 @@ static __init int set_mminit_loglevel(char *str) } early_param("mminit_loglevel", set_mminit_loglevel); #endif /* CONFIG_DEBUG_MEMORY_INIT */ + +struct kobject *mm_kobj; +EXPORT_SYMBOL_GPL(mm_kobj); + +static int __init mm_sysfs_init(void) +{ + mm_kobj = kobject_create_and_add("mm", kernel_kobj); + if (!mm_kobj) + return -ENOMEM; + + return 0; +} + +__initcall(mm_sysfs_init); -- cgit v1.2.2 From b7ba30c679ed1eb7ed3ed8f281f6493282042bd4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:40 -0700 Subject: hugetlb: factor out prep_new_huge_page Needed to avoid code duplication in follow up patches. Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eda9642254a0..32dff4290c66 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -513,6 +513,16 @@ static int adjust_pool_surplus(int delta) return ret; } +static void prep_new_huge_page(struct page *page, int nid) +{ + set_compound_page_dtor(page, free_huge_page); + spin_lock(&hugetlb_lock); + nr_huge_pages++; + nr_huge_pages_node[nid]++; + spin_unlock(&hugetlb_lock); + put_page(page); /* free it into the hugepage allocator */ +} + static struct page *alloc_fresh_huge_page_node(int nid) { struct page *page; @@ -526,12 +536,7 @@ static struct page *alloc_fresh_huge_page_node(int nid) __free_pages(page, HUGETLB_PAGE_ORDER); return NULL; } - set_compound_page_dtor(page, free_huge_page); - spin_lock(&hugetlb_lock); - nr_huge_pages++; - nr_huge_pages_node[nid]++; - spin_unlock(&hugetlb_lock); - put_page(page); /* free it into the hugepage allocator */ + prep_new_huge_page(page, nid); } return page; -- cgit v1.2.2 From a5516438959d90b071ff0a484ce4f3f523dc3152 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:41 -0700 Subject: hugetlb: modular state for hugetlb page size The goal of this patchset is to support multiple hugetlb page sizes. This is achieved by introducing a new struct hstate structure, which encapsulates the important hugetlb state and constants (eg. huge page size, number of huge pages currently allocated, etc). The hstate structure is then passed around the code which requires these fields, they will do the right thing regardless of the exact hstate they are operating on. This patch adds the hstate structure, with a single global instance of it (default_hstate), and does the basic work of converting hugetlb to use the hstate. Future patches will add more hstate structures to allow for different hugetlbfs mounts to have different page sizes. 
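For orientation, the state being encapsulated looks roughly like the sketch below. The real struct hstate definition lives in include/linux/hugetlb.h, outside this mm-only diff, and carries further fields, so this is a condensed illustration rather than the exact layout.

#include <linux/list.h>
#include <linux/numa.h>
#include <asm/page.h>

/* Condensed sketch of the per-page-size state; one instance per hugepage
 * size, with default_hstate covering the traditional HPAGE_SIZE. */
struct hstate_sketch {
	unsigned int order;			/* huge page = 1 << order base pages */
	unsigned long mask;			/* address mask for this page size */
	unsigned long max_huge_pages;
	unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
	unsigned long surplus_huge_pages, nr_overcommit_huge_pages;
	int hugetlb_next_nid;			/* round-robin allocation cursor */
	struct list_head hugepage_freelists[MAX_NUMNODES];
	unsigned int nr_huge_pages_node[MAX_NUMNODES];
	unsigned int free_huge_pages_node[MAX_NUMNODES];
	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
};

/* Accessors in this style replace the old HPAGE_SIZE/HPAGE_SHIFT constants. */
static inline unsigned int huge_page_order_sketch(struct hstate_sketch *h)
{
	return h->order;
}
static inline unsigned long huge_page_size_sketch(struct hstate_sketch *h)
{
	return 1UL << (h->order + PAGE_SHIFT);
}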
[akpm@linux-foundation.org: coding-style fixes] Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 368 +++++++++++++++++++++++++++++++-------------------------- mm/memory.c | 2 +- mm/mempolicy.c | 9 +- mm/mmap.c | 3 +- 4 files changed, 210 insertions(+), 172 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 32dff4290c66..0d8153e25f09 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,18 +22,12 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; -static unsigned long surplus_huge_pages; -static unsigned long nr_overcommit_huge_pages; unsigned long max_huge_pages; unsigned long sysctl_overcommit_huge_pages; -static struct list_head hugepage_freelists[MAX_NUMNODES]; -static unsigned int nr_huge_pages_node[MAX_NUMNODES]; -static unsigned int free_huge_pages_node[MAX_NUMNODES]; -static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -static int hugetlb_next_nid; + +struct hstate default_hstate; /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages @@ -203,11 +197,11 @@ static long region_count(struct list_head *head, long f, long t) * Convert the address within this vma to the page offset within * the mapping, in pagecache page units; huge pages here. */ -static pgoff_t vma_hugecache_offset(struct vm_area_struct *vma, - unsigned long address) +static pgoff_t vma_hugecache_offset(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { - return ((address - vma->vm_start) >> HPAGE_SHIFT) + - (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + return ((address - vma->vm_start) >> huge_page_shift(h)) + + (vma->vm_pgoff >> huge_page_order(h)); } /* @@ -309,20 +303,21 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) } /* Decrement the reserved pages in the hugepage pool by one */ -static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) +static void decrement_hugepage_resv_vma(struct hstate *h, + struct vm_area_struct *vma) { if (vma->vm_flags & VM_NORESERVE) return; if (vma->vm_flags & VM_SHARED) { /* Shared mappings always use reserves */ - resv_huge_pages--; + h->resv_huge_pages--; } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { /* * Only the process that called mmap() has reserves for * private mappings. 
*/ - resv_huge_pages--; + h->resv_huge_pages--; } } @@ -344,12 +339,13 @@ static int vma_has_private_reserves(struct vm_area_struct *vma) return 1; } -static void clear_huge_page(struct page *page, unsigned long addr) +static void clear_huge_page(struct page *page, + unsigned long addr, unsigned long sz) { int i; might_sleep(); - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { + for (i = 0; i < sz/PAGE_SIZE; i++) { cond_resched(); clear_user_highpage(page + i, addr + i * PAGE_SIZE); } @@ -359,41 +355,43 @@ static void copy_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; + struct hstate *h = hstate_vma(vma); might_sleep(); - for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { + for (i = 0; i < pages_per_huge_page(h); i++) { cond_resched(); copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); } } -static void enqueue_huge_page(struct page *page) +static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); - list_add(&page->lru, &hugepage_freelists[nid]); - free_huge_pages++; - free_huge_pages_node[nid]++; + list_add(&page->lru, &h->hugepage_freelists[nid]); + h->free_huge_pages++; + h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(void) +static struct page *dequeue_huge_page(struct hstate *h) { int nid; struct page *page = NULL; for (nid = 0; nid < MAX_NUMNODES; ++nid) { - if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + if (!list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; break; } } return page; } -static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, +static struct page *dequeue_huge_page_vma(struct hstate *h, + struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { int nid; @@ -411,26 +409,26 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, * not "stolen". 
The child may still get SIGKILLed */ if (!vma_has_private_reserves(vma) && - free_huge_pages - resv_huge_pages == 0) + h->free_huge_pages - h->resv_huge_pages == 0) return NULL; /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && free_huge_pages - resv_huge_pages == 0) + if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) return NULL; for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { nid = zone_to_nid(zone); if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && - !list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + !list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; if (!avoid_reserve) - decrement_hugepage_resv_vma(vma); + decrement_hugepage_resv_vma(h, vma); break; } @@ -439,12 +437,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, return page; } -static void update_and_free_page(struct page *page) +static void update_and_free_page(struct hstate *h, struct page *page) { int i; - nr_huge_pages--; - nr_huge_pages_node[page_to_nid(page)]--; - for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { + + h->nr_huge_pages--; + h->nr_huge_pages_node[page_to_nid(page)]--; + for (i = 0; i < pages_per_huge_page(h); i++) { page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1<< PG_writeback); @@ -452,11 +451,16 @@ static void update_and_free_page(struct page *page) set_compound_page_dtor(page, NULL); set_page_refcounted(page); arch_release_hugepage(page); - __free_pages(page, HUGETLB_PAGE_ORDER); + __free_pages(page, huge_page_order(h)); } static void free_huge_page(struct page *page) { + /* + * Can't pass hstate in here because it is called from the + * compound page destructor. + */ + struct hstate *h = &default_hstate; int nid = page_to_nid(page); struct address_space *mapping; @@ -466,12 +470,12 @@ static void free_huge_page(struct page *page) INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); - if (surplus_huge_pages_node[nid]) { - update_and_free_page(page); - surplus_huge_pages--; - surplus_huge_pages_node[nid]--; + if (h->surplus_huge_pages_node[nid]) { + update_and_free_page(h, page); + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; } else { - enqueue_huge_page(page); + enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); if (mapping) @@ -483,7 +487,7 @@ static void free_huge_page(struct page *page) * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. 
*/ -static int adjust_pool_surplus(int delta) +static int adjust_pool_surplus(struct hstate *h, int delta) { static int prev_nid; int nid = prev_nid; @@ -496,15 +500,15 @@ static int adjust_pool_surplus(int delta) nid = first_node(node_online_map); /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !surplus_huge_pages_node[nid]) + if (delta < 0 && !h->surplus_huge_pages_node[nid]) continue; /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && surplus_huge_pages_node[nid] >= - nr_huge_pages_node[nid]) + if (delta > 0 && h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) continue; - surplus_huge_pages += delta; - surplus_huge_pages_node[nid] += delta; + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; ret = 1; break; } while (nid != prev_nid); @@ -513,46 +517,46 @@ static int adjust_pool_surplus(int delta) return ret; } -static void prep_new_huge_page(struct page *page, int nid) +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); - nr_huge_pages++; - nr_huge_pages_node[nid]++; + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; spin_unlock(&hugetlb_lock); put_page(page); /* free it into the hugepage allocator */ } -static struct page *alloc_fresh_huge_page_node(int nid) +static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); + huge_page_order(h)); if (page) { if (arch_prepare_hugepage(page)) { __free_pages(page, HUGETLB_PAGE_ORDER); return NULL; } - prep_new_huge_page(page, nid); + prep_new_huge_page(h, page, nid); } return page; } -static int alloc_fresh_huge_page(void) +static int alloc_fresh_huge_page(struct hstate *h) { struct page *page; int start_nid; int next_nid; int ret = 0; - start_nid = hugetlb_next_nid; + start_nid = h->hugetlb_next_nid; do { - page = alloc_fresh_huge_page_node(hugetlb_next_nid); + page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); if (page) ret = 1; /* @@ -566,11 +570,11 @@ static int alloc_fresh_huge_page(void) * if we just successfully allocated a hugepage so that * the next caller gets hugepages on the next node. */ - next_nid = next_node(hugetlb_next_nid, node_online_map); + next_nid = next_node(h->hugetlb_next_nid, node_online_map); if (next_nid == MAX_NUMNODES) next_nid = first_node(node_online_map); - hugetlb_next_nid = next_nid; - } while (!page && hugetlb_next_nid != start_nid); + h->hugetlb_next_nid = next_nid; + } while (!page && h->hugetlb_next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -580,8 +584,8 @@ static int alloc_fresh_huge_page(void) return ret; } -static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, - unsigned long address) +static struct page *alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { struct page *page; unsigned int nid; @@ -610,18 +614,18 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, * per-node value is checked there. 
*/ spin_lock(&hugetlb_lock); - if (surplus_huge_pages >= nr_overcommit_huge_pages) { + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { spin_unlock(&hugetlb_lock); return NULL; } else { - nr_huge_pages++; - surplus_huge_pages++; + h->nr_huge_pages++; + h->surplus_huge_pages++; } spin_unlock(&hugetlb_lock); page = alloc_pages(htlb_alloc_mask|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); + huge_page_order(h)); spin_lock(&hugetlb_lock); if (page) { @@ -636,12 +640,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, /* * We incremented the global counters already */ - nr_huge_pages_node[nid]++; - surplus_huge_pages_node[nid]++; + h->nr_huge_pages_node[nid]++; + h->surplus_huge_pages_node[nid]++; __count_vm_event(HTLB_BUDDY_PGALLOC); } else { - nr_huge_pages--; - surplus_huge_pages--; + h->nr_huge_pages--; + h->surplus_huge_pages--; __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); } spin_unlock(&hugetlb_lock); @@ -653,16 +657,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, * Increase the hugetlb pool such that it can accomodate a reservation * of size 'delta'. */ -static int gather_surplus_pages(int delta) +static int gather_surplus_pages(struct hstate *h, int delta) { struct list_head surplus_list; struct page *page, *tmp; int ret, i; int needed, allocated; - needed = (resv_huge_pages + delta) - free_huge_pages; + needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { - resv_huge_pages += delta; + h->resv_huge_pages += delta; return 0; } @@ -673,7 +677,7 @@ static int gather_surplus_pages(int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(NULL, 0); + page = alloc_buddy_huge_page(h, NULL, 0); if (!page) { /* * We were not able to allocate enough pages to @@ -694,7 +698,8 @@ retry: * because either resv_huge_pages or free_huge_pages may have changed. */ spin_lock(&hugetlb_lock); - needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); + needed = (h->resv_huge_pages + delta) - + (h->free_huge_pages + allocated); if (needed > 0) goto retry; @@ -707,7 +712,7 @@ retry: * before they are reserved. */ needed += allocated; - resv_huge_pages += delta; + h->resv_huge_pages += delta; ret = 0; free: /* Free the needed pages to the hugetlb pool */ @@ -715,7 +720,7 @@ free: if ((--needed) < 0) break; list_del(&page->lru); - enqueue_huge_page(page); + enqueue_huge_page(h, page); } /* Free unnecessary surplus pages to the buddy allocator */ @@ -743,7 +748,8 @@ free: * allocated to satisfy the reservation must be explicitly freed if they were * never used. 
*/ -static void return_unused_surplus_pages(unsigned long unused_resv_pages) +static void return_unused_surplus_pages(struct hstate *h, + unsigned long unused_resv_pages) { static int nid = -1; struct page *page; @@ -758,27 +764,27 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) unsigned long remaining_iterations = num_online_nodes(); /* Uncommit the reservation */ - resv_huge_pages -= unused_resv_pages; + h->resv_huge_pages -= unused_resv_pages; - nr_pages = min(unused_resv_pages, surplus_huge_pages); + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); while (remaining_iterations-- && nr_pages) { nid = next_node(nid, node_online_map); if (nid == MAX_NUMNODES) nid = first_node(node_online_map); - if (!surplus_huge_pages_node[nid]) + if (!h->surplus_huge_pages_node[nid]) continue; - if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + if (!list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - update_and_free_page(page); - free_huge_pages--; - free_huge_pages_node[nid]--; - surplus_huge_pages--; - surplus_huge_pages_node[nid]--; + update_and_free_page(h, page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; nr_pages--; remaining_iterations = num_online_nodes(); } @@ -794,13 +800,14 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) * an instantiated the change should be committed via vma_commit_reservation. * No action is required on failure. */ -static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) +static int vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; if (vma->vm_flags & VM_SHARED) { - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); return region_chg(&inode->i_mapping->private_list, idx, idx + 1); @@ -809,7 +816,7 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) } else { int err; - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *reservations = vma_resv_map(vma); err = region_chg(&reservations->regions, idx, idx + 1); @@ -818,18 +825,18 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) return 0; } } -static void vma_commit_reservation(struct vm_area_struct *vma, - unsigned long addr) +static void vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; if (vma->vm_flags & VM_SHARED) { - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *reservations = vma_resv_map(vma); /* Mark this page used in the map. 
*/ @@ -840,6 +847,7 @@ static void vma_commit_reservation(struct vm_area_struct *vma, static struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { + struct hstate *h = hstate_vma(vma); struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; @@ -852,7 +860,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, * MAP_NORESERVE mappings may also need pages and quota allocated * if no reserve mapping overlaps. */ - chg = vma_needs_reservation(vma, addr); + chg = vma_needs_reservation(h, vma, addr); if (chg < 0) return ERR_PTR(chg); if (chg) @@ -860,11 +868,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(vma, addr, avoid_reserve); + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); spin_unlock(&hugetlb_lock); if (!page) { - page = alloc_buddy_huge_page(vma, addr); + page = alloc_buddy_huge_page(h, vma, addr); if (!page) { hugetlb_put_quota(inode->i_mapping, chg); return ERR_PTR(-VM_FAULT_OOM); @@ -874,7 +882,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_refcounted(page); set_page_private(page, (unsigned long) mapping); - vma_commit_reservation(vma, addr); + vma_commit_reservation(h, vma, addr); return page; } @@ -882,21 +890,28 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, static int __init hugetlb_init(void) { unsigned long i; + struct hstate *h = &default_hstate; if (HPAGE_SHIFT == 0) return 0; + if (!h->order) { + h->order = HPAGE_SHIFT - PAGE_SHIFT; + h->mask = HPAGE_MASK; + } + for (i = 0; i < MAX_NUMNODES; ++i) - INIT_LIST_HEAD(&hugepage_freelists[i]); + INIT_LIST_HEAD(&h->hugepage_freelists[i]); - hugetlb_next_nid = first_node(node_online_map); + h->hugetlb_next_nid = first_node(node_online_map); for (i = 0; i < max_huge_pages; ++i) { - if (!alloc_fresh_huge_page()) + if (!alloc_fresh_huge_page(h)) break; } - max_huge_pages = free_huge_pages = nr_huge_pages = i; - printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); + max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; + printk(KERN_INFO "Total HugeTLB memory allocated, %ld\n", + h->free_huge_pages); return 0; } module_init(hugetlb_init); @@ -922,34 +937,36 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM -static void try_to_free_low(unsigned long count) +static void try_to_free_low(struct hstate *h, unsigned long count) { int i; for (i = 0; i < MAX_NUMNODES; ++i) { struct page *page, *next; - list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { - if (count >= nr_huge_pages) + struct list_head *freel = &h->hugepage_freelists[i]; + list_for_each_entry_safe(page, next, freel, lru) { + if (count >= h->nr_huge_pages) return; if (PageHighMem(page)) continue; list_del(&page->lru); update_and_free_page(page); - free_huge_pages--; - free_huge_pages_node[page_to_nid(page)]--; + h->free_huge_pages--; + h->free_huge_pages_node[page_to_nid(page)]--; } } } #else -static inline void try_to_free_low(unsigned long count) +static inline void try_to_free_low(struct hstate *h, unsigned long count) { } #endif -#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) +#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static unsigned long set_max_huge_pages(unsigned long count) { unsigned long min_count, ret; + struct hstate *h = &default_hstate; /* * Increase 
the pool size @@ -963,19 +980,19 @@ static unsigned long set_max_huge_pages(unsigned long count) * within all the constraints specified by the sysctls. */ spin_lock(&hugetlb_lock); - while (surplus_huge_pages && count > persistent_huge_pages) { - if (!adjust_pool_surplus(-1)) + while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, -1)) break; } - while (count > persistent_huge_pages) { + while (count > persistent_huge_pages(h)) { /* * If this allocation races such that we no longer need the * page, free_huge_page will handle it by freeing the page * and reducing the surplus. */ spin_unlock(&hugetlb_lock); - ret = alloc_fresh_huge_page(); + ret = alloc_fresh_huge_page(h); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -997,21 +1014,21 @@ static unsigned long set_max_huge_pages(unsigned long count) * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. */ - min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; + min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); - try_to_free_low(min_count); - while (min_count < persistent_huge_pages) { - struct page *page = dequeue_huge_page(); + try_to_free_low(h, min_count); + while (min_count < persistent_huge_pages(h)) { + struct page *page = dequeue_huge_page(h); if (!page) break; - update_and_free_page(page); + update_and_free_page(h, page); } - while (count < persistent_huge_pages) { - if (!adjust_pool_surplus(1)) + while (count < persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, 1)) break; } out: - ret = persistent_huge_pages; + ret = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); return ret; } @@ -1041,9 +1058,10 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { + struct hstate *h = &default_hstate; proc_doulongvec_minmax(table, write, file, buffer, length, ppos); spin_lock(&hugetlb_lock); - nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; + h->nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; spin_unlock(&hugetlb_lock); return 0; } @@ -1052,37 +1070,40 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, int hugetlb_report_meminfo(char *buf) { + struct hstate *h = &default_hstate; return sprintf(buf, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" "HugePages_Rsvd: %5lu\n" "HugePages_Surp: %5lu\n" "Hugepagesize: %5lu kB\n", - nr_huge_pages, - free_huge_pages, - resv_huge_pages, - surplus_huge_pages, - HPAGE_SIZE/1024); + h->nr_huge_pages, + h->free_huge_pages, + h->resv_huge_pages, + h->surplus_huge_pages, + 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); } int hugetlb_report_node_meminfo(int nid, char *buf) { + struct hstate *h = &default_hstate; return sprintf(buf, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" "Node %d HugePages_Surp: %5u\n", - nid, nr_huge_pages_node[nid], - nid, free_huge_pages_node[nid], - nid, surplus_huge_pages_node[nid]); + nid, h->nr_huge_pages_node[nid], + nid, h->free_huge_pages_node[nid], + nid, h->surplus_huge_pages_node[nid]); } /* Return the number pages of memory we physically have, in PAGE_SIZE units. 
*/ unsigned long hugetlb_total_pages(void) { - return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); + struct hstate *h = &default_hstate; + return h->nr_huge_pages * pages_per_huge_page(h); } -static int hugetlb_acct_memory(long delta) +static int hugetlb_acct_memory(struct hstate *h, long delta) { int ret = -ENOMEM; @@ -1105,18 +1126,18 @@ static int hugetlb_acct_memory(long delta) * semantics that cpuset has. */ if (delta > 0) { - if (gather_surplus_pages(delta) < 0) + if (gather_surplus_pages(h, delta) < 0) goto out; - if (delta > cpuset_mems_nr(free_huge_pages_node)) { - return_unused_surplus_pages(delta); + if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { + return_unused_surplus_pages(h, delta); goto out; } } ret = 0; if (delta < 0) - return_unused_surplus_pages((unsigned long) -delta); + return_unused_surplus_pages(h, (unsigned long) -delta); out: spin_unlock(&hugetlb_lock); @@ -1141,14 +1162,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) static void hugetlb_vm_op_close(struct vm_area_struct *vma) { + struct hstate *h = hstate_vma(vma); struct resv_map *reservations = vma_resv_map(vma); unsigned long reserve; unsigned long start; unsigned long end; if (reservations) { - start = vma_hugecache_offset(vma, vma->vm_start); - end = vma_hugecache_offset(vma, vma->vm_end); + start = vma_hugecache_offset(h, vma, vma->vm_start); + end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - region_count(&reservations->regions, start, end); @@ -1156,7 +1178,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) kref_put(&reservations->refs, resv_map_release); if (reserve) - hugetlb_acct_memory(-reserve); + hugetlb_acct_memory(h, -reserve); } } @@ -1214,14 +1236,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct page *ptepage; unsigned long addr; int cow; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, addr); + dst_pte = huge_pte_alloc(dst, addr, sz); if (!dst_pte) goto nomem; @@ -1257,6 +1281,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, pte_t pte; struct page *page; struct page *tmp; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + /* * A page gathering list, protected by per file i_mmap_lock. 
The * lock is used to avoid list corruption from multiple unmapping @@ -1265,11 +1292,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, LIST_HEAD(page_list); WARN_ON(!is_vm_hugetlb_page(vma)); - BUG_ON(start & ~HPAGE_MASK); - BUG_ON(end & ~HPAGE_MASK); + BUG_ON(start & ~huge_page_mask(h)); + BUG_ON(end & ~huge_page_mask(h)); spin_lock(&mm->page_table_lock); - for (address = start; address < end; address += HPAGE_SIZE) { + for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; @@ -1383,6 +1410,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, struct page *pagecache_page) { + struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; int avoidcopy; int outside_reserve = 0; @@ -1443,7 +1471,7 @@ retry_avoidcopy: __SetPageUptodate(new_page); spin_lock(&mm->page_table_lock); - ptep = huge_pte_offset(mm, address & HPAGE_MASK); + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ huge_ptep_clear_flush(vma, address, ptep); @@ -1458,14 +1486,14 @@ retry_avoidcopy: } /* Return the pagecache page at a given address within a VMA */ -static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, - unsigned long address) +static struct page *hugetlbfs_pagecache_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping; pgoff_t idx; mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(vma, address); + idx = vma_hugecache_offset(h, vma, address); return find_lock_page(mapping, idx); } @@ -1473,6 +1501,7 @@ static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, int write_access) { + struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; pgoff_t idx; unsigned long size; @@ -1493,7 +1522,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, } mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(vma, address); + idx = vma_hugecache_offset(h, vma, address); /* * Use page lock to guard against racing truncation @@ -1502,7 +1531,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, retry: page = find_lock_page(mapping, idx); if (!page) { - size = i_size_read(mapping->host) >> HPAGE_SHIFT; + size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; page = alloc_huge_page(vma, address, 0); @@ -1510,7 +1539,7 @@ retry: ret = -PTR_ERR(page); goto out; } - clear_huge_page(page, address); + clear_huge_page(page, address, huge_page_size(h)); __SetPageUptodate(page); if (vma->vm_flags & VM_SHARED) { @@ -1526,14 +1555,14 @@ retry: } spin_lock(&inode->i_lock); - inode->i_blocks += BLOCKS_PER_HUGEPAGE; + inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); } else lock_page(page); } spin_lock(&mm->page_table_lock); - size = i_size_read(mapping->host) >> HPAGE_SHIFT; + size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -1569,8 +1598,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t entry; int ret; static DEFINE_MUTEX(hugetlb_instantiation_mutex); + struct hstate *h = hstate_vma(vma); - ptep = huge_pte_alloc(mm, address); + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); if (!ptep) return 
VM_FAULT_OOM; @@ -1594,7 +1624,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (likely(pte_same(entry, huge_ptep_get(ptep)))) if (write_access && !pte_write(entry)) { struct page *page; - page = hugetlbfs_pagecache_page(vma, address); + page = hugetlbfs_pagecache_page(h, vma, address); ret = hugetlb_cow(mm, vma, address, ptep, entry, page); if (page) { unlock_page(page); @@ -1615,6 +1645,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long pfn_offset; unsigned long vaddr = *position; int remainder = *length; + struct hstate *h = hstate_vma(vma); spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { @@ -1626,7 +1657,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * each hugepage. We have to make * sure we get the * first, for the page indexing below to work. */ - pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); + pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); if (!pte || huge_pte_none(huge_ptep_get(pte)) || (write && !pte_write(huge_ptep_get(pte)))) { @@ -1644,7 +1675,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; + pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { @@ -1660,7 +1691,7 @@ same_page: --remainder; ++i; if (vaddr < vma->vm_end && remainder && - pfn_offset < HPAGE_SIZE/PAGE_SIZE) { + pfn_offset < pages_per_huge_page(h)) { /* * We use pfn_offset to avoid touching the pageframes * of this compound page. @@ -1682,13 +1713,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, unsigned long start = address; pte_t *ptep; pte_t pte; + struct hstate *h = hstate_vma(vma); BUG_ON(address >= end); flush_cache_range(vma, address, end); spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); spin_lock(&mm->page_table_lock); - for (; address < end; address += HPAGE_SIZE) { + for (; address < end; address += huge_page_size(h)) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; @@ -1711,6 +1743,7 @@ int hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma) { long ret, chg; + struct hstate *h = hstate_inode(inode); if (vma && vma->vm_flags & VM_NORESERVE) return 0; @@ -1739,7 +1772,7 @@ int hugetlb_reserve_pages(struct inode *inode, if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; - ret = hugetlb_acct_memory(chg); + ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); return ret; @@ -1751,12 +1784,13 @@ int hugetlb_reserve_pages(struct inode *inode, void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { + struct hstate *h = hstate_inode(inode); long chg = region_truncate(&inode->i_mapping->private_list, offset); spin_lock(&inode->i_lock); - inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; + inode->i_blocks -= blocks_per_huge_page(h); spin_unlock(&inode->i_lock); hugetlb_put_quota(inode->i_mapping, (chg - freed)); - hugetlb_acct_memory(-(chg - freed)); + hugetlb_acct_memory(h, -(chg - freed)); } diff --git a/mm/memory.c b/mm/memory.c index 72932489a082..c1c1d6d8c22b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -903,7 +903,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, if (unlikely(is_vm_hugetlb_page(vma))) { unmap_hugepage_range(vma, start, end, NULL); zap_work -= (end - start) / - (HPAGE_SIZE / PAGE_SIZE); + pages_per_huge_page(hstate_vma(vma)); start = end; } else start = unmap_page_range(*tlbp, vma, 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c94e58b192c3..e550bec20582 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1481,7 +1481,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { zl = node_zonelist(interleave_nid(*mpol, vma, addr, - HPAGE_SHIFT), gfp_flags); + huge_page_shift(hstate_vma(vma))), gfp_flags); } else { zl = policy_zonelist(gfp_flags, *mpol); if ((*mpol)->mode == MPOL_BIND) @@ -2220,9 +2220,12 @@ static void check_huge_range(struct vm_area_struct *vma, { unsigned long addr; struct page *page; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); - for (addr = start; addr < end; addr += HPAGE_SIZE) { - pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); + for (addr = start; addr < end; addr += sz) { + pte_t *ptep = huge_pte_offset(vma->vm_mm, + addr & huge_page_mask(h)); pte_t pte; if (!ptep) diff --git a/mm/mmap.c b/mm/mmap.c index 57d3b6097deb..5e0cc99e9cd5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1812,7 +1812,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, struct mempolicy *pol; struct vm_area_struct *new; - if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) + if (is_vm_hugetlb_page(vma) && (addr & + ~(huge_page_mask(hstate_vma(vma))))) return -EINVAL; if (mm->map_count >= sysctl_max_map_count) -- cgit v1.2.2 From e5ff215941d59f8ae6bf58f6428dc5c26745a612 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:42 -0700 Subject: hugetlb: multiple hstates for multiple page sizes Add basic support for more than one hstate in hugetlbfs. This is the key to supporting multiple hugetlbfs page sizes at once. - Rather than a single hstate, we now have an array, with an iterator - default_hstate continues to be the struct hstate which we use by default - Add functions for architectures to register new hstates [akpm@linux-foundation.org: coding-style fixes] Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 121 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0d8153e25f09..82378d44a0c5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,12 +22,19 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -unsigned long max_huge_pages; -unsigned long sysctl_overcommit_huge_pages; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -struct hstate default_hstate; +static int max_hstate; +unsigned int default_hstate_idx; +struct hstate hstates[HUGE_MAX_HSTATE]; + +/* for command line parsing */ +static struct hstate * __initdata parsed_hstate; +static unsigned long __initdata default_hstate_max_huge_pages; + +#define for_each_hstate(h) \ + for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages @@ -454,13 +461,24 @@ static void update_and_free_page(struct hstate *h, struct page *page) __free_pages(page, huge_page_order(h)); } +struct hstate *size_to_hstate(unsigned long size) +{ + struct hstate *h; + + for_each_hstate(h) { + if (huge_page_size(h) == size) + return h; + } + return NULL; +} + static void free_huge_page(struct page *page) { /* * Can't pass hstate in here because it is called from the * compound page destructor. 
*/ - struct hstate *h = &default_hstate; + struct hstate *h = page_hstate(page); int nid = page_to_nid(page); struct address_space *mapping; @@ -887,39 +905,94 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } -static int __init hugetlb_init(void) +static void __init hugetlb_init_one_hstate(struct hstate *h) { unsigned long i; - struct hstate *h = &default_hstate; - - if (HPAGE_SHIFT == 0) - return 0; - - if (!h->order) { - h->order = HPAGE_SHIFT - PAGE_SHIFT; - h->mask = HPAGE_MASK; - } for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); h->hugetlb_next_nid = first_node(node_online_map); - for (i = 0; i < max_huge_pages; ++i) { + for (i = 0; i < h->max_huge_pages; ++i) { if (!alloc_fresh_huge_page(h)) break; } - max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; - printk(KERN_INFO "Total HugeTLB memory allocated, %ld\n", - h->free_huge_pages); + h->max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; +} + +static void __init hugetlb_init_hstates(void) +{ + struct hstate *h; + + for_each_hstate(h) { + hugetlb_init_one_hstate(h); + } +} + +static void __init report_hugepages(void) +{ + struct hstate *h; + + for_each_hstate(h) { + printk(KERN_INFO "Total HugeTLB memory allocated, " + "%ld %dMB pages\n", + h->free_huge_pages, + 1 << (h->order + PAGE_SHIFT - 20)); + } +} + +static int __init hugetlb_init(void) +{ + BUILD_BUG_ON(HPAGE_SHIFT == 0); + + if (!size_to_hstate(HPAGE_SIZE)) { + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); + parsed_hstate->max_huge_pages = default_hstate_max_huge_pages; + } + default_hstate_idx = size_to_hstate(HPAGE_SIZE) - hstates; + + hugetlb_init_hstates(); + + report_hugepages(); + return 0; } module_init(hugetlb_init); +/* Should be called on processing a hugepagesz=... option */ +void __init hugetlb_add_hstate(unsigned order) +{ + struct hstate *h; + if (size_to_hstate(PAGE_SIZE << order)) { + printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); + return; + } + BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(order == 0); + h = &hstates[max_hstate++]; + h->order = order; + h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + hugetlb_init_one_hstate(h); + parsed_hstate = h; +} + static int __init hugetlb_setup(char *s) { - if (sscanf(s, "%lu", &max_huge_pages) <= 0) - max_huge_pages = 0; + unsigned long *mhp; + + /* + * !max_hstate means we haven't parsed a hugepagesz= parameter yet, + * so this hugepages= parameter goes to the "default hstate". 
+ */ + if (!max_hstate) + mhp = &default_hstate_max_huge_pages; + else + mhp = &parsed_hstate->max_huge_pages; + + if (sscanf(s, "%lu", mhp) <= 0) + *mhp = 0; + return 1; } __setup("hugepages=", hugetlb_setup); @@ -950,7 +1023,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count) if (PageHighMem(page)) continue; list_del(&page->lru); - update_and_free_page(page); + update_and_free_page(h, page); h->free_huge_pages--; h->free_huge_pages_node[page_to_nid(page)]--; } @@ -963,10 +1036,9 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) #endif #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) -static unsigned long set_max_huge_pages(unsigned long count) +static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { unsigned long min_count, ret; - struct hstate *h = &default_hstate; /* * Increase the pool size @@ -1037,8 +1109,19 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { + struct hstate *h = &default_hstate; + unsigned long tmp; + + if (!write) + tmp = h->max_huge_pages; + + table->data = &tmp; + table->maxlen = sizeof(unsigned long); proc_doulongvec_minmax(table, write, file, buffer, length, ppos); - max_huge_pages = set_max_huge_pages(max_huge_pages); + + if (write) + h->max_huge_pages = set_max_huge_pages(h, tmp); + return 0; } @@ -1059,10 +1142,21 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; + unsigned long tmp; + + if (!write) + tmp = h->nr_overcommit_huge_pages; + + table->data = &tmp; + table->maxlen = sizeof(unsigned long); proc_doulongvec_minmax(table, write, file, buffer, length, ppos); - spin_lock(&hugetlb_lock); - h->nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; - spin_unlock(&hugetlb_lock); + + if (write) { + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = tmp; + spin_unlock(&hugetlb_lock); + } + return 0; } -- cgit v1.2.2 From a137e1cc6d6e7d315fef03962a2a5a113348b13b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:43 -0700 Subject: hugetlbfs: per mount huge page sizes Add the ability to configure the hugetlb hstate used on a per mount basis. - Add a new pagesize= option to the hugetlbfs mount that allows setting the page size - This option causes the mount code to find the hstate corresponding to the specified size, and sets up a pointer to the hstate in the mount's superblock. - Change the hstate accessors to use this information rather than the global_hstate they were using (requires a slight change in mm/memory.c so we don't NULL deref in the error-unmap path -- see comments). 
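As a usage illustration only (not part of the patch): a userspace consumer of the new option might mount hugetlbfs with an explicit page size via mount(2) and then map a file from it. The mount point, file name and the 2 MB size are assumptions, the mount point must already exist, pages of that size must be configured in the pool, and the "2M" suffix is assumed to be accepted by the option parser.

/* sketch: mount hugetlbfs with an explicit page size and touch one page */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mount.h>
#include <sys/mman.h>

int main(void)
{
        const size_t sz = 2UL * 1024 * 1024;    /* assumed huge page size */
        int fd;
        void *p;

        /* "/mnt/huge" must already exist; "pagesize=2M" is the new option */
        if (mount("none", "/mnt/huge", "hugetlbfs", 0, "pagesize=2M")) {
                perror("mount");
                return 1;
        }
        fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        p = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memset(p, 0, sz);                       /* fault in one huge page */
        munmap(p, sz);
        close(fd);
        return 0;
}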
[np: take hstate out of hugetlbfs inode and vma->vm_private_data] Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 16 +++------------- mm/memory.c | 18 ++++++++++++++++-- 2 files changed, 19 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 82378d44a0c5..4cf7a90e9140 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1439,19 +1439,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { - /* - * It is undesirable to test vma->vm_file as it should be non-null - * for valid hugetlb area. However, vm_file will be NULL in the error - * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails, - * do_mmap_pgoff() nullifies vma->vm_file before calling this function - * to clean up. Since no pte has actually been setup, it is safe to - * do nothing in this case. - */ - if (vma->vm_file) { - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); - __unmap_hugepage_range(vma, start, end, ref_page); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); - } + spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + __unmap_hugepage_range(vma, start, end, ref_page); + spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); } /* diff --git a/mm/memory.c b/mm/memory.c index c1c1d6d8c22b..02fc6b1047b0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -901,9 +901,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, } if (unlikely(is_vm_hugetlb_page(vma))) { - unmap_hugepage_range(vma, start, end, NULL); - zap_work -= (end - start) / + /* + * It is undesirable to test vma->vm_file as it + * should be non-null for valid hugetlb area. + * However, vm_file will be NULL in the error + * cleanup path of do_mmap_pgoff. When + * hugetlbfs ->mmap method fails, + * do_mmap_pgoff() nullifies vma->vm_file + * before calling this function to clean up. + * Since no pte has actually been setup, it is + * safe to do nothing in this case. + */ + if (vma->vm_file) { + unmap_hugepage_range(vma, start, end, NULL); + zap_work -= (end - start) / pages_per_huge_page(hstate_vma(vma)); + } + start = end; } else start = unmap_page_range(*tlbp, vma, -- cgit v1.2.2 From a3437870160cf2caaac6bdd76c7377a5a4145a8c Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Wed, 23 Jul 2008 21:27:44 -0700 Subject: hugetlb: new sysfs interface Provide new hugepages user APIs that are more suited to multiple hstates in sysfs. There is a new directory, /sys/kernel/hugepages. Underneath that directory there will be a directory per-supported hugepage size, e.g.: /sys/kernel/hugepages/hugepages-64kB /sys/kernel/hugepages/hugepages-16384kB /sys/kernel/hugepages/hugepages-16777216kB corresponding to 64k, 16m and 16g respectively. Within each hugepages-size directory there are a number of files, corresponding to the tracked counters in the hstate, e.g.: /sys/kernel/hugepages/hugepages-64/nr_hugepages /sys/kernel/hugepages/hugepages-64/nr_overcommit_hugepages /sys/kernel/hugepages/hugepages-64/free_hugepages /sys/kernel/hugepages/hugepages-64/resv_hugepages /sys/kernel/hugepages/hugepages-64/surplus_hugepages Of these files, the first two are read-write and the latter three are read-only. The size of the hugepage being manipulated is trivially deducible from the enclosing directory and is always expressed in kB (to match meminfo). 
[dave@linux.vnet.ibm.com: fix build] [nacc@us.ibm.com: hugetlb: hang off of /sys/kernel/mm rather than /sys/kernel] [nacc@us.ibm.com: hugetlb: remove CONFIG_SYSFS dependency] Acked-by: Greg Kroah-Hartman Signed-off-by: Nishanth Aravamudan Signed-off-by: Nick Piggin Cc: Dave Hansen Signed-off-by: Nishanth Aravamudan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 288 +++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 222 insertions(+), 66 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4cf7a90e9140..bb49ce5d0067 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -942,72 +943,6 @@ static void __init report_hugepages(void) } } -static int __init hugetlb_init(void) -{ - BUILD_BUG_ON(HPAGE_SHIFT == 0); - - if (!size_to_hstate(HPAGE_SIZE)) { - hugetlb_add_hstate(HUGETLB_PAGE_ORDER); - parsed_hstate->max_huge_pages = default_hstate_max_huge_pages; - } - default_hstate_idx = size_to_hstate(HPAGE_SIZE) - hstates; - - hugetlb_init_hstates(); - - report_hugepages(); - - return 0; -} -module_init(hugetlb_init); - -/* Should be called on processing a hugepagesz=... option */ -void __init hugetlb_add_hstate(unsigned order) -{ - struct hstate *h; - if (size_to_hstate(PAGE_SIZE << order)) { - printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); - return; - } - BUG_ON(max_hstate >= HUGE_MAX_HSTATE); - BUG_ON(order == 0); - h = &hstates[max_hstate++]; - h->order = order; - h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); - hugetlb_init_one_hstate(h); - parsed_hstate = h; -} - -static int __init hugetlb_setup(char *s) -{ - unsigned long *mhp; - - /* - * !max_hstate means we haven't parsed a hugepagesz= parameter yet, - * so this hugepages= parameter goes to the "default hstate". 
- */ - if (!max_hstate) - mhp = &default_hstate_max_huge_pages; - else - mhp = &parsed_hstate->max_huge_pages; - - if (sscanf(s, "%lu", mhp) <= 0) - *mhp = 0; - - return 1; -} -__setup("hugepages=", hugetlb_setup); - -static unsigned int cpuset_mems_nr(unsigned int *array) -{ - int node; - unsigned int nr = 0; - - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; - - return nr; -} - #ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count) @@ -1105,6 +1040,227 @@ out: return ret; } +#define HSTATE_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define HSTATE_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static struct kobject *hugepages_kobj; +static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; + +static struct hstate *kobj_to_hstate(struct kobject *kobj) +{ + int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (hstate_kobjs[i] == kobj) + return &hstates[i]; + BUG(); + return NULL; +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->nr_huge_pages); +} +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + h->max_huge_pages = set_max_huge_pages(h, input); + + return count; +} +HSTATE_ATTR(nr_hugepages); + +static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); +} +static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = input; + spin_unlock(&hugetlb_lock); + + return count; +} +HSTATE_ATTR(nr_overcommit_hugepages); + +static ssize_t free_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->free_huge_pages); +} +HSTATE_ATTR_RO(free_hugepages); + +static ssize_t resv_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->resv_huge_pages); +} +HSTATE_ATTR_RO(resv_hugepages); + +static ssize_t surplus_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->surplus_huge_pages); +} +HSTATE_ATTR_RO(surplus_hugepages); + +static struct attribute *hstate_attrs[] = { + &nr_hugepages_attr.attr, + &nr_overcommit_hugepages_attr.attr, + &free_hugepages_attr.attr, + &resv_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static struct attribute_group hstate_attr_group = { + .attrs = hstate_attrs, +}; + +static int __init hugetlb_sysfs_add_hstate(struct hstate *h) +{ + int retval; + + hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, + hugepages_kobj); + if (!hstate_kobjs[h - hstates]) + return -ENOMEM; + + retval = 
sysfs_create_group(hstate_kobjs[h - hstates], + &hstate_attr_group); + if (retval) + kobject_put(hstate_kobjs[h - hstates]); + + return retval; +} + +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h); + if (err) + printk(KERN_ERR "Hugetlb: Unable to add hstate %s", + h->name); + } +} + +static void __exit hugetlb_exit(void) +{ + struct hstate *h; + + for_each_hstate(h) { + kobject_put(hstate_kobjs[h - hstates]); + } + + kobject_put(hugepages_kobj); +} +module_exit(hugetlb_exit); + +static int __init hugetlb_init(void) +{ + BUILD_BUG_ON(HPAGE_SHIFT == 0); + + if (!size_to_hstate(HPAGE_SIZE)) { + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); + parsed_hstate->max_huge_pages = default_hstate_max_huge_pages; + } + default_hstate_idx = size_to_hstate(HPAGE_SIZE) - hstates; + + hugetlb_init_hstates(); + + report_hugepages(); + + hugetlb_sysfs_init(); + + return 0; +} +module_init(hugetlb_init); + +/* Should be called on processing a hugepagesz=... option */ +void __init hugetlb_add_hstate(unsigned order) +{ + struct hstate *h; + if (size_to_hstate(PAGE_SIZE << order)) { + printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); + return; + } + BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(order == 0); + h = &hstates[max_hstate++]; + h->order = order; + h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", + huge_page_size(h)/1024); + hugetlb_init_one_hstate(h); + parsed_hstate = h; +} + +static int __init hugetlb_setup(char *s) +{ + unsigned long *mhp; + + /* + * !max_hstate means we haven't parsed a hugepagesz= parameter yet, + * so this hugepages= parameter goes to the "default hstate". + */ + if (!max_hstate) + mhp = &default_hstate_max_huge_pages; + else + mhp = &parsed_hstate->max_huge_pages; + + if (sscanf(s, "%lu", mhp) <= 0) + *mhp = 0; + + return 1; +} +__setup("hugepages=", hugetlb_setup); + +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) -- cgit v1.2.2 From 5ced66c901f1cf0b684feb15c2cd8b126e263d07 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:45 -0700 Subject: hugetlb: abstract numa round robin selection Need this as a separate function for a future patch. No behaviour change. Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bb49ce5d0067..5e620e25cf08 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -565,6 +565,27 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) return page; } +/* + * Use a helper variable to find the next node and then + * copy it back to hugetlb_next_nid afterwards: + * otherwise there's a window in which a racer might + * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * But we don't need to use a spin_lock here: it really + * doesn't matter if occasionally a racer chooses the + * same nid as we do. 
Move nid forward in the mask even + * if we just successfully allocated a hugepage so that + * the next caller gets hugepages on the next node. + */ +static int hstate_next_node(struct hstate *h) +{ + int next_nid; + next_nid = next_node(h->hugetlb_next_nid, node_online_map); + if (next_nid == MAX_NUMNODES) + next_nid = first_node(node_online_map); + h->hugetlb_next_nid = next_nid; + return next_nid; +} + static int alloc_fresh_huge_page(struct hstate *h) { struct page *page; @@ -578,21 +599,7 @@ static int alloc_fresh_huge_page(struct hstate *h) page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); if (page) ret = 1; - /* - * Use a helper variable to find the next node and then - * copy it back to hugetlb_next_nid afterwards: - * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. - * But we don't need to use a spin_lock here: it really - * doesn't matter if occasionally a racer chooses the - * same nid as we do. Move nid forward in the mask even - * if we just successfully allocated a hugepage so that - * the next caller gets hugepages on the next node. - */ - next_nid = next_node(h->hugetlb_next_nid, node_online_map); - if (next_nid == MAX_NUMNODES) - next_nid = first_node(node_online_map); - h->hugetlb_next_nid = next_nid; + next_nid = hstate_next_node(h); } while (!page && h->hugetlb_next_nid != start_nid); if (ret) -- cgit v1.2.2 From b54bbf7b81170f03597c17dd0b559e3006bc9868 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:45 -0700 Subject: mm: introduce non panic alloc_bootmem Straight forward variant of the existing __alloc_bootmem_node, only subsequent patch when allocating giant hugepages at boot -- don't want to panic if we can't allocate as many as the user asked for. Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 4bc6ae2fbaa3..9ac972535fff 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -578,6 +578,18 @@ void * __init alloc_bootmem_section(unsigned long size, } #endif +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); + if (ptr) + return ptr; + + return __alloc_bootmem_nopanic(size, align, goal); +} + #ifndef ARCH_LOW_ADDRESS_LIMIT #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL #endif -- cgit v1.2.2 From 01ad1c0827db5b3695c53e296dbb2c1da16a0911 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:46 -0700 Subject: mm: export prep_compound_page to mm hugetlb will need to get compound pages from bootmem to handle the case of them being greater than or equal to MAX_ORDER. Export the constructor function needed for this. 
Acked-by: Adam Litke Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 2 ++ mm/page_alloc.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 858ad01864dc..1f43f7416972 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -16,6 +16,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); +extern void prep_compound_page(struct page *page, unsigned long order); + static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_count, v); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e43aae135b38..eaa86671ebbd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -264,7 +264,7 @@ static void free_compound_page(struct page *page) __free_pages_ok(page, compound_order(page)); } -static void prep_compound_page(struct page *page, unsigned long order) +void prep_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; -- cgit v1.2.2 From aa888a74977a8f2120ae9332376e179c39a6b07d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:47 -0700 Subject: hugetlb: support larger than MAX_ORDER This is needed on x86-64 to handle GB pages in hugetlbfs, because it is not practical to enlarge MAX_ORDER to 1GB. Instead the 1GB pages are only allocated at boot using the bootmem allocator using the hugepages=... option. These 1G bootmem pages are never freed. In theory it would be possible to implement that with some complications, but since it would be a one-way street (>= MAX_ORDER pages cannot be allocated later) I decided not to currently. The >= MAX_ORDER code is not ifdef'ed per architecture. It is not very big and the ifdef uglyness seemed not be worth it. Known problems: /proc/meminfo and "free" do not display the memory allocated for gb pages in "Total". This is a little confusing for the user. 
Acked-by: Andrew Hastings Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5e620e25cf08..1a6fe87555b2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -489,7 +490,7 @@ static void free_huge_page(struct page *page) INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); - if (h->surplus_huge_pages_node[nid]) { + if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { update_and_free_page(h, page); h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; @@ -550,6 +551,9 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; + if (h->order >= MAX_ORDER) + return NULL; + page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, @@ -616,6 +620,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, struct page *page; unsigned int nid; + if (h->order >= MAX_ORDER) + return NULL; + /* * Assume we will successfully allocate the surplus page to * prevent racing processes from causing the surplus to exceed @@ -792,6 +799,10 @@ static void return_unused_surplus_pages(struct hstate *h, /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; + /* Cannot return gigantic pages currently */ + if (h->order >= MAX_ORDER) + return; + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); while (remaining_iterations-- && nr_pages) { @@ -913,6 +924,63 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } +static __initdata LIST_HEAD(huge_boot_pages); + +struct huge_bootmem_page { + struct list_head list; + struct hstate *hstate; +}; + +static int __init alloc_bootmem_huge_page(struct hstate *h) +{ + struct huge_bootmem_page *m; + int nr_nodes = nodes_weight(node_online_map); + + while (nr_nodes) { + void *addr; + + addr = __alloc_bootmem_node_nopanic( + NODE_DATA(h->hugetlb_next_nid), + huge_page_size(h), huge_page_size(h), 0); + + if (addr) { + /* + * Use the beginning of the huge page to store the + * huge_bootmem_page struct (until gather_bootmem + * puts them into the mem_map). 
+ */ + m = addr; + if (m) + goto found; + } + hstate_next_node(h); + nr_nodes--; + } + return 0; + +found: + BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); + /* Put them into a private list first because mem_map is not up yet */ + list_add(&m->list, &huge_boot_pages); + m->hstate = h; + return 1; +} + +/* Put bootmem huge pages into the standard lists after mem_map is up */ +static void __init gather_bootmem_prealloc(void) +{ + struct huge_bootmem_page *m; + + list_for_each_entry(m, &huge_boot_pages, list) { + struct page *page = virt_to_page(m); + struct hstate *h = m->hstate; + __ClearPageReserved(page); + WARN_ON(page_count(page) != 1); + prep_compound_page(page, h->order); + prep_new_huge_page(h, page, page_to_nid(page)); + } +} + static void __init hugetlb_init_one_hstate(struct hstate *h) { unsigned long i; @@ -923,7 +991,10 @@ static void __init hugetlb_init_one_hstate(struct hstate *h) h->hugetlb_next_nid = first_node(node_online_map); for (i = 0; i < h->max_huge_pages; ++i) { - if (!alloc_fresh_huge_page(h)) + if (h->order >= MAX_ORDER) { + if (!alloc_bootmem_huge_page(h)) + break; + } else if (!alloc_fresh_huge_page(h)) break; } h->max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; @@ -956,6 +1027,9 @@ static void try_to_free_low(struct hstate *h, unsigned long count) { int i; + if (h->order >= MAX_ORDER) + return; + for (i = 0; i < MAX_NUMNODES; ++i) { struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; @@ -982,6 +1056,9 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { unsigned long min_count, ret; + if (h->order >= MAX_ORDER) + return h->max_huge_pages; + /* * Increase the pool size * First take pages out of surplus state. Then make up the @@ -1210,6 +1287,8 @@ static int __init hugetlb_init(void) hugetlb_init_hstates(); + gather_bootmem_prealloc(); + report_hugepages(); hugetlb_sysfs_init(); -- cgit v1.2.2 From 8faa8b077b2cdc4e4646842fe50b07840955a013 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:48 -0700 Subject: hugetlb: support boot allocate different sizes Make some infrastructure changes to allow boot-time allocation of different hugepage page sizes. - move all basic hstate initialisation into hugetlb_add_hstate - create a new function hugetlb_hstate_alloc_pages() to do the actual initial page allocations. Call this function early in order to allocate giant pages from bootmem. 
- Check for multiple hugepages= parameters Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Acked-by: Andrew Hastings Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1a6fe87555b2..243a8684d180 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -981,15 +981,10 @@ static void __init gather_bootmem_prealloc(void) } } -static void __init hugetlb_init_one_hstate(struct hstate *h) +static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long i; - for (i = 0; i < MAX_NUMNODES; ++i) - INIT_LIST_HEAD(&h->hugepage_freelists[i]); - - h->hugetlb_next_nid = first_node(node_online_map); - for (i = 0; i < h->max_huge_pages; ++i) { if (h->order >= MAX_ORDER) { if (!alloc_bootmem_huge_page(h)) @@ -997,7 +992,7 @@ static void __init hugetlb_init_one_hstate(struct hstate *h) } else if (!alloc_fresh_huge_page(h)) break; } - h->max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; + h->max_huge_pages = i; } static void __init hugetlb_init_hstates(void) @@ -1005,7 +1000,9 @@ static void __init hugetlb_init_hstates(void) struct hstate *h; for_each_hstate(h) { - hugetlb_init_one_hstate(h); + /* oversize hugepages were init'ed in early boot */ + if (h->order < MAX_ORDER) + hugetlb_hstate_alloc_pages(h); } } @@ -1301,6 +1298,8 @@ module_init(hugetlb_init); void __init hugetlb_add_hstate(unsigned order) { struct hstate *h; + unsigned long i; + if (size_to_hstate(PAGE_SIZE << order)) { printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); return; @@ -1310,15 +1309,21 @@ void __init hugetlb_add_hstate(unsigned order) h = &hstates[max_hstate++]; h->order = order; h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + h->nr_huge_pages = 0; + h->free_huge_pages = 0; + for (i = 0; i < MAX_NUMNODES; ++i) + INIT_LIST_HEAD(&h->hugepage_freelists[i]); + h->hugetlb_next_nid = first_node(node_online_map); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); - hugetlb_init_one_hstate(h); + parsed_hstate = h; } static int __init hugetlb_setup(char *s) { unsigned long *mhp; + static unsigned long *last_mhp; /* * !max_hstate means we haven't parsed a hugepagesz= parameter yet, @@ -1329,9 +1334,25 @@ static int __init hugetlb_setup(char *s) else mhp = &parsed_hstate->max_huge_pages; + if (mhp == last_mhp) { + printk(KERN_WARNING "hugepages= specified twice without " + "interleaving hugepagesz=, ignoring\n"); + return 1; + } + if (sscanf(s, "%lu", mhp) <= 0) *mhp = 0; + /* + * Global state is always initialized later in hugetlb_init. + * But we need to allocate >= MAX_ORDER hstates here early to still + * use the bootmem allocator. 
+ */ + if (max_hstate && parsed_hstate->order >= MAX_ORDER) + hugetlb_hstate_alloc_pages(parsed_hstate); + + last_mhp = mhp; + return 1; } __setup("hugepages=", hugetlb_setup); -- cgit v1.2.2 From 4abd32dbab201c3ced0b0af12accea77cd9eeffc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:49 -0700 Subject: hugetlb: printk cleanup - Reword sentence to clarify meaning with multiple options - Add support for using GB prefixes for the page size - Add extra printk to delayed > MAX_ORDER allocation code Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 243a8684d180..0c74c14dd2f7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1006,15 +1006,27 @@ static void __init hugetlb_init_hstates(void) } } +static char * __init memfmt(char *buf, unsigned long n) +{ + if (n >= (1UL << 30)) + sprintf(buf, "%lu GB", n >> 30); + else if (n >= (1UL << 20)) + sprintf(buf, "%lu MB", n >> 20); + else + sprintf(buf, "%lu KB", n >> 10); + return buf; +} + static void __init report_hugepages(void) { struct hstate *h; for_each_hstate(h) { - printk(KERN_INFO "Total HugeTLB memory allocated, " - "%ld %dMB pages\n", - h->free_huge_pages, - 1 << (h->order + PAGE_SHIFT - 20)); + char buf[32]; + printk(KERN_INFO "HugeTLB registered %s page size, " + "pre-allocated %ld pages\n", + memfmt(buf, huge_page_size(h)), + h->free_huge_pages); } } -- cgit v1.2.2 From ceb868796181dc95ea01a110e123afd391639873 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:50 -0700 Subject: hugetlb: introduce pud_huge Straight forward extensions for huge pages located in the PUD instead of PMDs. 
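Condensed from the mm/memory.c hunk below, follow_page() now dispatches on the PUD before descending to the PMD; this is only a sketch of the shape of that change, not new code:

/* sketch: PUD-level dispatch in follow_page() (see the diff below) */
pud = pud_offset(pgd, address);
if (pud_none(*pud))
        goto no_page_table;
if (pud_huge(*pud)) {
        BUG_ON(flags & FOLL_GET);
        page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
        goto out;
}
if (unlikely(pud_bad(*pud)))
        goto no_page_table;
/* otherwise fall through to the existing pmd_huge()/pte handling */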
Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 +++++++++ mm/memory.c | 15 +++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0c74c14dd2f7..107c1ce223cb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1996,6 +1996,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } +/* Can be overriden by architectures */ +__attribute__((weak)) struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + BUG(); + return NULL; +} + int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i, diff --git a/mm/memory.c b/mm/memory.c index 02fc6b1047b0..262e3eb6601a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -998,19 +998,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto no_page_table; pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) + if (pud_none(*pud)) + goto no_page_table; + if (pud_huge(*pud)) { + BUG_ON(flags & FOLL_GET); + page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); + goto out; + } + if (unlikely(pud_bad(*pud))) goto no_page_table; - + pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto no_page_table; - if (pmd_huge(*pmd)) { BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } - if (unlikely(pmd_bad(*pmd))) goto no_page_table; @@ -1567,6 +1572,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long next; int err; + BUG_ON(pud_huge(*pud)); + pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; -- cgit v1.2.2 From e11bfbfcb08ef4223b863799897c19cdf7c5bc00 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 23 Jul 2008 21:27:52 -0700 Subject: hugetlb: override default huge page size Allow configurations with the default huge page size which is different to the traditional HPAGE_SIZE size. The default huge page size is the one represented in the legacy /proc ABIs, SHM, and which is defaulted to when mounting hugetlbfs filesystems. This is implemented with a new kernel option default_hugepagesz=, which defaults to HPAGE_SIZE if not specified. 
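One way to observe the resulting default from userspace is the Hugepagesize: line in /proc/meminfo, which reports the default hstate; a minimal, illustrative reader:

/* print the default huge page size as reported by /proc/meminfo */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, "Hugepagesize:", 13)) {
                        fputs(line, stdout);
                        break;
                }
        }
        fclose(f);
        return 0;
}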
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 107c1ce223cb..2a2f6e869401 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,6 +34,7 @@ struct hstate hstates[HUGE_MAX_HSTATE]; /* for command line parsing */ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; +static unsigned long __initdata default_hstate_size; #define for_each_hstate(h) \ for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) @@ -1288,11 +1289,14 @@ static int __init hugetlb_init(void) { BUILD_BUG_ON(HPAGE_SHIFT == 0); - if (!size_to_hstate(HPAGE_SIZE)) { - hugetlb_add_hstate(HUGETLB_PAGE_ORDER); - parsed_hstate->max_huge_pages = default_hstate_max_huge_pages; + if (!size_to_hstate(default_hstate_size)) { + default_hstate_size = HPAGE_SIZE; + if (!size_to_hstate(default_hstate_size)) + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); } - default_hstate_idx = size_to_hstate(HPAGE_SIZE) - hstates; + default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; + if (default_hstate_max_huge_pages) + default_hstate.max_huge_pages = default_hstate_max_huge_pages; hugetlb_init_hstates(); @@ -1332,7 +1336,7 @@ void __init hugetlb_add_hstate(unsigned order) parsed_hstate = h; } -static int __init hugetlb_setup(char *s) +static int __init hugetlb_nrpages_setup(char *s) { unsigned long *mhp; static unsigned long *last_mhp; @@ -1367,7 +1371,14 @@ static int __init hugetlb_setup(char *s) return 1; } -__setup("hugepages=", hugetlb_setup); +__setup("hugepages=", hugetlb_nrpages_setup); + +static int __init hugetlb_default_setup(char *s) +{ + default_hstate_size = memparse(s, &s); + return 1; +} +__setup("default_hugepagesz=", hugetlb_default_setup); static unsigned int cpuset_mems_nr(unsigned int *array) { -- cgit v1.2.2 From 53ba51d21d6e048424ab8aadfebdb1f25ae07b60 Mon Sep 17 00:00:00 2001 From: Jon Tollefson Date: Wed, 23 Jul 2008 21:27:52 -0700 Subject: hugetlb: allow arch overridden hugepage allocation Allow alloc_bootmem_huge_page() to be overridden by architectures that can't always use bootmem. This requires huge_boot_pages to be available for use by this function. This is required for powerpc 16G pages, which have to be reserved prior to boot-time. The location of these pages are indicated in the device tree. 
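A hypothetical architecture override of the weak default might look like the sketch below. arch_take_reserved_region() is an invented placeholder for however the platform locates its pre-reserved memory (powerpc reads it from the device tree), so this is illustrative only; the bookkeeping mirrors the generic version shown earlier.

/* kernel-context sketch of an arch-specific alloc_bootmem_huge_page() */
int alloc_bootmem_huge_page(struct hstate *h)
{
        struct huge_bootmem_page *m;

        /* hypothetical helper: hand back one pre-reserved, size-aligned block */
        m = arch_take_reserved_region(huge_page_size(h));
        if (!m)
                return 0;

        /* same bookkeeping as the generic weak implementation */
        list_add(&m->list, &huge_boot_pages);
        m->hstate = h;
        return 1;
}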
Acked-by: Adam Litke Signed-off-by: Jon Tollefson Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2a2f6e869401..3e1506b808a3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -31,6 +31,8 @@ static int max_hstate; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; +__initdata LIST_HEAD(huge_boot_pages); + /* for command line parsing */ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; @@ -925,14 +927,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } -static __initdata LIST_HEAD(huge_boot_pages); - -struct huge_bootmem_page { - struct list_head list; - struct hstate *hstate; -}; - -static int __init alloc_bootmem_huge_page(struct hstate *h) +__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; int nr_nodes = nodes_weight(node_online_map); -- cgit v1.2.2 From 7f09ca51e925ba62e9ebfd4979f093e97e38adeb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:27:58 -0700 Subject: hugetlb: fix a hugepage reservation check for MAP_SHARED When removing a huge page from the hugepage pool for a fault the system checks to see if the mapping requires additional pages to be reserved, and if it does whether there are any unreserved pages remaining. If not, the allocation fails without even attempting to get a page. In order to determine whether to apply this check we call vma_has_private_reserves() which tells us if this vma is MAP_PRIVATE and is the owner. This incorrectly triggers the remaining reservation test for MAP_SHARED mappings which prevents allocation of the final page in the pool even though it is reserved for this mapping. In reality we only want to check this for MAP_PRIVATE mappings where the process is not the original mapper. Replace vma_has_private_reserves() with vma_has_reserves() which indicates whether further reserves are required, and update the caller. Signed-off-by: Mel Gorman Acked-by: Adam Litke Acked-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3e1506b808a3..8c20aed62b9c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -342,13 +342,13 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) } /* Returns true if the VMA has associated reserve pages */ -static int vma_has_private_reserves(struct vm_area_struct *vma) +static int vma_has_reserves(struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHARED) - return 0; - if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) - return 0; - return 1; + return 1; + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return 1; + return 0; } static void clear_huge_page(struct page *page, @@ -420,7 +420,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, * have no page reserves. This check ensures that reservations are * not "stolen". 
The child may still get SIGKILLed */ - if (!vma_has_private_reserves(vma) && + if (!vma_has_reserves(vma) && h->free_huge_pages - h->resv_huge_pages == 0) return NULL; -- cgit v1.2.2 From 7251ff78b94c2a68d267623d09b32672b20662c1 Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Wed, 23 Jul 2008 21:27:59 -0700 Subject: hugetlb: quota is not freed for unused reserved private huge pages With shared reservations (and now also with private reservations), we reserve huge pages at mmap time. We also account for the mapping against fs quota to prevent a reservation from being preempted by quota exhaustion. When testing with the libhugetlbfs test suite, I found a problem with quota accounting. FS quota for allocated pages is handled correctly but we are not releasing quota for private pages that were reserved but never allocated. Do this in hugetlb_vm_op_close() at the same time as unused page reservations are released. Signed-off-by: Adam Litke Cc: Mel Gorman Cc: Johannes Weiner Cc: William Lee Irwin III Cc: Hugh Dickins Acked-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8c20aed62b9c..41341c414194 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1552,8 +1552,10 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) kref_put(&reservations->refs, resv_map_release); - if (reserve) + if (reserve) { hugetlb_acct_memory(h, -reserve); + hugetlb_put_quota(vma->vm_file->f_mapping, reserve); + } } } -- cgit v1.2.2 From 223e8dc9249c9e15f6c8b638d73fcad78ccb0a88 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:28:00 -0700 Subject: bootmem: reorder code to match new bootmem structure This only reorders functions so that further patches will be easier to read. No code changed. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 356 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 177 insertions(+), 179 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 9ac972535fff..24eacf52c50e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -38,6 +38,19 @@ unsigned long saved_max_pfn; bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; +/* + * Given an initialised bdata, it returns the size of the boot bitmap + */ +static unsigned long __init get_mapsize(bootmem_data_t *bdata) +{ + unsigned long mapsize; + unsigned long start = PFN_DOWN(bdata->node_boot_start); + unsigned long end = bdata->node_low_pfn; + + mapsize = ((end - start) + 7) / 8; + return ALIGN(mapsize, sizeof(long)); +} + /* return the number of _pages_ that will be allocated for the boot bitmap */ unsigned long __init bootmem_bootmap_pages(unsigned long pages) { @@ -71,19 +84,6 @@ static void __init link_bootmem(bootmem_data_t *bdata) list_add_tail(&bdata->list, &bdata_list); } -/* - * Given an initialised bdata, it returns the size of the boot bitmap - */ -static unsigned long __init get_mapsize(bootmem_data_t *bdata) -{ - unsigned long mapsize; - unsigned long start = PFN_DOWN(bdata->node_boot_start); - unsigned long end = bdata->node_low_pfn; - - mapsize = ((end - start) + 7) / 8; - return ALIGN(mapsize, sizeof(long)); -} - /* * Called once to set up the allocator itself. 
*/ @@ -108,6 +108,146 @@ static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, return mapsize; } +unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, + unsigned long startpfn, unsigned long endpfn) +{ + return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn); +} + +unsigned long __init init_bootmem(unsigned long start, unsigned long pages) +{ + max_low_pfn = pages; + min_low_pfn = start; + return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); +} + +static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) +{ + struct page *page; + unsigned long pfn; + unsigned long i, count; + unsigned long idx; + unsigned long *map; + int gofast = 0; + + BUG_ON(!bdata->node_bootmem_map); + + count = 0; + /* first extant page of the node */ + pfn = PFN_DOWN(bdata->node_boot_start); + idx = bdata->node_low_pfn - pfn; + map = bdata->node_bootmem_map; + /* + * Check if we are aligned to BITS_PER_LONG pages. If so, we might + * be able to free page orders of that size at once. + */ + if (!(pfn & (BITS_PER_LONG-1))) + gofast = 1; + + for (i = 0; i < idx; ) { + unsigned long v = ~map[i / BITS_PER_LONG]; + + if (gofast && v == ~0UL) { + int order; + + page = pfn_to_page(pfn); + count += BITS_PER_LONG; + order = ffs(BITS_PER_LONG) - 1; + __free_pages_bootmem(page, order); + i += BITS_PER_LONG; + page += BITS_PER_LONG; + } else if (v) { + unsigned long m; + + page = pfn_to_page(pfn); + for (m = 1; m && i < idx; m<<=1, page++, i++) { + if (v & m) { + count++; + __free_pages_bootmem(page, 0); + } + } + } else { + i += BITS_PER_LONG; + } + pfn += BITS_PER_LONG; + } + + /* + * Now free the allocator bitmap itself, it's not + * needed anymore: + */ + page = virt_to_page(bdata->node_bootmem_map); + idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT; + for (i = 0; i < idx; i++, page++) + __free_pages_bootmem(page, 0); + count += i; + bdata->node_bootmem_map = NULL; + + return count; +} + +unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) +{ + register_page_bootmem_info_node(pgdat); + return free_all_bootmem_core(pgdat->bdata); +} + +unsigned long __init free_all_bootmem(void) +{ + return free_all_bootmem_core(NODE_DATA(0)->bdata); +} + +static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, + unsigned long size) +{ + unsigned long sidx, eidx; + unsigned long i; + + BUG_ON(!size); + + /* out range */ + if (addr + size < bdata->node_boot_start || + PFN_DOWN(addr) > bdata->node_low_pfn) + return; + /* + * round down end of usable mem, partially free pages are + * considered reserved. + */ + + if (addr >= bdata->node_boot_start && addr < bdata->last_success) + bdata->last_success = addr; + + /* + * Round up to index to the range. 
+ */ + if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start)) + sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start); + else + sidx = 0; + + eidx = PFN_DOWN(addr + size - bdata->node_boot_start); + if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) + eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); + + for (i = sidx; i < eidx; i++) { + if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) + BUG(); + } +} + +void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) +{ + free_bootmem_core(pgdat->bdata, physaddr, size); +} + +void __init free_bootmem(unsigned long addr, unsigned long size) +{ + bootmem_data_t *bdata; + list_for_each_entry(bdata, &bdata_list, list) + free_bootmem_core(bdata, addr, size); +} + /* * Marks a particular physical memory range as unallocatable. Usable RAM * might be used for boot-time allocations - or it might get added @@ -183,43 +323,36 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, } } -static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, - unsigned long size) +int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size, int flags) { - unsigned long sidx, eidx; - unsigned long i; - - BUG_ON(!size); - - /* out range */ - if (addr + size < bdata->node_boot_start || - PFN_DOWN(addr) > bdata->node_low_pfn) - return; - /* - * round down end of usable mem, partially free pages are - * considered reserved. - */ - - if (addr >= bdata->node_boot_start && addr < bdata->last_success) - bdata->last_success = addr; + int ret; - /* - * Round up to index to the range. - */ - if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start)) - sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start); - else - sidx = 0; + ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); + if (ret < 0) + return -ENOMEM; + reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); + return 0; +} - eidx = PFN_DOWN(addr + size - bdata->node_boot_start); - if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) - eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); +#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE +int __init reserve_bootmem(unsigned long addr, unsigned long size, + int flags) +{ + bootmem_data_t *bdata; + int ret; - for (i = sidx; i < eidx; i++) { - if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) - BUG(); + list_for_each_entry(bdata, &bdata_list, list) { + ret = can_reserve_bootmem_core(bdata, addr, size, flags); + if (ret < 0) + return ret; } + list_for_each_entry(bdata, &bdata_list, list) + reserve_bootmem_core(bdata, addr, size, flags); + + return 0; } +#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ /* * We 'merge' subsequent allocations to save space. We might 'lose' @@ -371,140 +504,6 @@ found: return ret; } -static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) -{ - struct page *page; - unsigned long pfn; - unsigned long i, count; - unsigned long idx; - unsigned long *map; - int gofast = 0; - - BUG_ON(!bdata->node_bootmem_map); - - count = 0; - /* first extant page of the node */ - pfn = PFN_DOWN(bdata->node_boot_start); - idx = bdata->node_low_pfn - pfn; - map = bdata->node_bootmem_map; - /* - * Check if we are aligned to BITS_PER_LONG pages. If so, we might - * be able to free page orders of that size at once. 
- */ - if (!(pfn & (BITS_PER_LONG-1))) - gofast = 1; - - for (i = 0; i < idx; ) { - unsigned long v = ~map[i / BITS_PER_LONG]; - - if (gofast && v == ~0UL) { - int order; - - page = pfn_to_page(pfn); - count += BITS_PER_LONG; - order = ffs(BITS_PER_LONG) - 1; - __free_pages_bootmem(page, order); - i += BITS_PER_LONG; - page += BITS_PER_LONG; - } else if (v) { - unsigned long m; - - page = pfn_to_page(pfn); - for (m = 1; m && i < idx; m<<=1, page++, i++) { - if (v & m) { - count++; - __free_pages_bootmem(page, 0); - } - } - } else { - i += BITS_PER_LONG; - } - pfn += BITS_PER_LONG; - } - - /* - * Now free the allocator bitmap itself, it's not - * needed anymore: - */ - page = virt_to_page(bdata->node_bootmem_map); - idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT; - for (i = 0; i < idx; i++, page++) - __free_pages_bootmem(page, 0); - count += i; - bdata->node_bootmem_map = NULL; - - return count; -} - -unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, - unsigned long startpfn, unsigned long endpfn) -{ - return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn); -} - -int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, - unsigned long size, int flags) -{ - int ret; - - ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); - if (ret < 0) - return -ENOMEM; - reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); - - return 0; -} - -void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, - unsigned long size) -{ - free_bootmem_core(pgdat->bdata, physaddr, size); -} - -unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) -{ - register_page_bootmem_info_node(pgdat); - return free_all_bootmem_core(pgdat->bdata); -} - -unsigned long __init init_bootmem(unsigned long start, unsigned long pages) -{ - max_low_pfn = pages; - min_low_pfn = start; - return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); -} - -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE -int __init reserve_bootmem(unsigned long addr, unsigned long size, - int flags) -{ - bootmem_data_t *bdata; - int ret; - - list_for_each_entry(bdata, &bdata_list, list) { - ret = can_reserve_bootmem_core(bdata, addr, size, flags); - if (ret < 0) - return ret; - } - list_for_each_entry(bdata, &bdata_list, list) - reserve_bootmem_core(bdata, addr, size, flags); - - return 0; -} -#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ - -void __init free_bootmem(unsigned long addr, unsigned long size) -{ - bootmem_data_t *bdata; - list_for_each_entry(bdata, &bdata_list, list) - free_bootmem_core(bdata, addr, size); -} - -unsigned long __init free_all_bootmem(void) -{ - return free_all_bootmem_core(NODE_DATA(0)->bdata); -} - void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) { @@ -534,7 +533,6 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return NULL; } - void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { -- cgit v1.2.2 From 57cfc29efac6670355ee0e107c8dbae8237d406b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:28:00 -0700 Subject: bootmem: clean up bootmem.c file header Change the description, move a misplaced comment about the allocator itself and add me to the list of copyright holders. 
Signed-off-by: Johannes Weiner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 24eacf52c50e..286e12c536ae 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -1,12 +1,12 @@ /* - * linux/mm/bootmem.c + * bootmem - A boot-time physical memory allocator and configurator * * Copyright (C) 1999 Ingo Molnar - * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * 1999 Kanoj Sarcar, SGI + * 2008 Johannes Weiner * - * simple boot-time physical memory area allocator and - * free memory collector. It's used to deal with reserved - * system memory and memory holes as well. + * Access to this subsystem has to be serialized externally (which is true + * for the boot process anyway). */ #include #include @@ -19,10 +19,6 @@ #include "internal.h" -/* - * Access to this subsystem has to be serialized externally. (this is - * true for the boot process anyway) - */ unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -- cgit v1.2.2 From a66fd7daec1f40c1f0eac466f0da9206b615fe2a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:28:01 -0700 Subject: bootmem: add documentation to API functions Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 286e12c536ae..105ad4cff2e1 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -47,7 +47,10 @@ static unsigned long __init get_mapsize(bootmem_data_t *bdata) return ALIGN(mapsize, sizeof(long)); } -/* return the number of _pages_ that will be allocated for the boot bitmap */ +/** + * bootmem_bootmap_pages - calculate bitmap size in pages + * @pages: number of pages the bitmap has to represent + */ unsigned long __init bootmem_bootmap_pages(unsigned long pages) { unsigned long mapsize; @@ -104,12 +107,28 @@ static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, return mapsize; } +/** + * init_bootmem_node - register a node as boot memory + * @pgdat: node to register + * @freepfn: pfn where the bitmap for this node is to be placed + * @startpfn: first pfn on the node + * @endpfn: first pfn after the node + * + * Returns the number of bytes needed to hold the bitmap for this node. + */ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) { return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn); } +/** + * init_bootmem - register boot memory + * @start: pfn where the bitmap is to be placed + * @pages: number of available physical pages + * + * Returns the number of bytes needed to hold the bitmap. + */ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) { max_low_pfn = pages; @@ -182,12 +201,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } +/** + * free_all_bootmem_node - release a node's free pages to the buddy allocator + * @pgdat: node to be released + * + * Returns the number of pages actually released. 
+ */ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); return free_all_bootmem_core(pgdat->bdata); } +/** + * free_all_bootmem - release free pages to the buddy allocator + * + * Returns the number of pages actually released. + */ unsigned long __init free_all_bootmem(void) { return free_all_bootmem_core(NODE_DATA(0)->bdata); @@ -231,12 +261,32 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, } } +/** + * free_bootmem_node - mark a page range as usable + * @pgdat: node the range resides on + * @physaddr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * Only physical pages that actually reside on @pgdat are marked. + */ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { free_bootmem_core(pgdat->bdata, physaddr, size); } +/** + * free_bootmem - mark a page range as usable + * @addr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * All physical pages within the range are marked, no matter what + * node they reside on. + */ void __init free_bootmem(unsigned long addr, unsigned long size) { bootmem_data_t *bdata; @@ -319,6 +369,17 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, } } +/** + * reserve_bootmem_node - mark a page range as reserved + * @pgdat: node the range resides on + * @physaddr: starting address of the range + * @size: size of the range in bytes + * @flags: reservation flags (see linux/bootmem.h) + * + * Partial pages will be reserved. + * + * Only physical pages that actually reside on @pgdat are marked. + */ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { @@ -332,6 +393,17 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, } #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE +/** + * reserve_bootmem - mark a page range as usable + * @addr: starting address of the range + * @size: size of the range in bytes + * @flags: reservation flags (see linux/bootmem.h) + * + * Partial pages will be reserved. + * + * All physical pages within the range are marked, no matter what + * node they reside on. + */ int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { @@ -500,6 +572,19 @@ found: return ret; } +/** + * __alloc_bootmem_nopanic - allocate boot memory without panicking + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * Returns NULL on failure. + */ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) { @@ -514,6 +599,19 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, return NULL; } +/** + * __alloc_bootmem - allocate boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. 
+ */ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { @@ -529,6 +627,21 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return NULL; } +/** + * __alloc_bootmem_node - allocate boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { @@ -542,6 +655,13 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, } #ifdef CONFIG_SPARSEMEM +/** + * alloc_bootmem_section - allocate boot memory from a specific section + * @size: size of the request in bytes + * @section_nr: sparse map section to allocate from + * + * Return NULL on failure. + */ void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { @@ -588,6 +708,19 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL #endif +/** + * __alloc_bootmem_low - allocate low boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. + */ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) { @@ -609,6 +742,21 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, return NULL; } +/** + * __alloc_bootmem_low_node - allocate low boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { -- cgit v1.2.2 From 2e5237daf0cc3c8d87762f53f704dc54fa91dcf6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:28:02 -0700 Subject: bootmem: add debugging framework Introduce the bootmem_debug kernel parameter that enables very verbose diagnostics regarding all range operations of bootmem as well as the initialization and release of nodes. 
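As a rough userspace model of the pattern (not the kernel code itself; the flag name, the fake command line and the helper below are invented for illustration), the mechanism boils down to an early-parsed flag gating a printf-style macro that tags every message with the calling function:

#include <stdio.h>
#include <string.h>

static int bootmem_debug;

#define bdebug(fmt, ...)					\
	do {							\
		if (bootmem_debug)				\
			printf("bootmem::%s " fmt, __func__,	\
			       ##__VA_ARGS__);			\
	} while (0)

/* stands in for early_param("bootmem_debug", bootmem_debug_setup) */
static void parse_cmdline(const char *cmdline)
{
	if (strstr(cmdline, "bootmem_debug"))
		bootmem_debug = 1;
}

static void mark_range(unsigned long start, unsigned long end)
{
	bdebug("start=%lx end=%lx\n", start, end);
}

int main(void)
{
	parse_cmdline("console=ttyS0 bootmem_debug");
	mark_range(0x100, 0x200);
	return 0;
}

Booting with bootmem_debug on the kernel command line enables the real diagnostics; the sketch only mirrors the shape of the macro.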
[akpm@linux-foundation.org: fix printk warnings] Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 105ad4cff2e1..4e085ee1d98e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -34,6 +34,22 @@ unsigned long saved_max_pfn; bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; +static int bootmem_debug; + +static int __init bootmem_debug_setup(char *buf) +{ + bootmem_debug = 1; + return 0; +} +early_param("bootmem_debug", bootmem_debug_setup); + +#define bdebug(fmt, args...) ({ \ + if (unlikely(bootmem_debug)) \ + printk(KERN_INFO \ + "bootmem::%s " fmt, \ + __FUNCTION__, ## args); \ +}) + /* * Given an initialised bdata, it returns the size of the boot bitmap */ @@ -104,6 +120,9 @@ static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, mapsize = get_mapsize(bdata); memset(bdata->node_bootmem_map, 0xff, mapsize); + bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n", + bdata - bootmem_node_data, start, mapstart, end, mapsize); + return mapsize; } @@ -198,6 +217,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) count += i; bdata->node_bootmem_map = NULL; + bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); + return count; } @@ -255,6 +276,10 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); + bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, + sidx + PFN_DOWN(bdata->node_boot_start), + eidx + PFN_DOWN(bdata->node_boot_start)); + for (i = sidx; i < eidx; i++) { if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) BUG(); @@ -360,13 +385,16 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); - for (i = sidx; i < eidx; i++) { - if (test_and_set_bit(i, bdata->node_bootmem_map)) { -#ifdef CONFIG_DEBUG_BOOTMEM - printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); -#endif - } - } + bdebug("nid=%td start=%lx end=%lx flags=%x\n", + bdata - bootmem_node_data, + sidx + PFN_DOWN(bdata->node_boot_start), + eidx + PFN_DOWN(bdata->node_boot_start), + flags); + + for (i = sidx; i < eidx; i++) + if (test_and_set_bit(i, bdata->node_bootmem_map)) + bdebug("hm, page %lx reserved twice.\n", + PFN_DOWN(bdata->node_boot_start) + i); } /** @@ -455,6 +483,10 @@ alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, if (!bdata->node_bootmem_map) return NULL; + bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", + bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, + align, goal, limit); + /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? 
*/ node_boot_start = bdata->node_boot_start; node_bootmem_map = bdata->node_bootmem_map; @@ -562,6 +594,11 @@ found: ret = phys_to_virt(start * PAGE_SIZE + node_boot_start); } + bdebug("nid=%td start=%lx end=%lx\n", + bdata - bootmem_node_data, + start + PFN_DOWN(bdata->node_boot_start), + start + areasize + PFN_DOWN(bdata->node_boot_start)); + /* * Reserve the area now: */ -- cgit v1.2.2 From df049a5f41a3b2eee2131221959e3b558ba7c705 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:28:02 -0700 Subject: bootmem: revisit bitmap size calculations Reincarnate get_mapsize as bootmap_bytes and implement bootmem_bootmap_pages on top of it. Adjust users of these helpers and make free_all_bootmem_core use bootmem_bootmap_pages instead of open-coding it. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 4e085ee1d98e..484849bfc8c4 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -50,17 +50,11 @@ early_param("bootmem_debug", bootmem_debug_setup); __FUNCTION__, ## args); \ }) -/* - * Given an initialised bdata, it returns the size of the boot bitmap - */ -static unsigned long __init get_mapsize(bootmem_data_t *bdata) +static unsigned long __init bootmap_bytes(unsigned long pages) { - unsigned long mapsize; - unsigned long start = PFN_DOWN(bdata->node_boot_start); - unsigned long end = bdata->node_low_pfn; + unsigned long bytes = (pages + 7) / 8; - mapsize = ((end - start) + 7) / 8; - return ALIGN(mapsize, sizeof(long)); + return ALIGN(bytes, sizeof(long)); } /** @@ -69,13 +63,9 @@ static unsigned long __init get_mapsize(bootmem_data_t *bdata) */ unsigned long __init bootmem_bootmap_pages(unsigned long pages) { - unsigned long mapsize; - - mapsize = (pages+7)/8; - mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; - mapsize >>= PAGE_SHIFT; + unsigned long bytes = bootmap_bytes(pages); - return mapsize; + return PAGE_ALIGN(bytes) >> PAGE_SHIFT; } /* @@ -117,7 +107,7 @@ static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, * Initially all pages are reserved - setup_arch() has to * register free RAM areas explicitly. */ - mapsize = get_mapsize(bdata); + mapsize = bootmap_bytes(end - start); memset(bdata->node_bootmem_map, 0xff, mapsize); bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n", @@ -160,7 +150,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) struct page *page; unsigned long pfn; unsigned long i, count; - unsigned long idx; + unsigned long idx, pages; unsigned long *map; int gofast = 0; @@ -211,7 +201,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) * needed anymore: */ page = virt_to_page(bdata->node_bootmem_map); - idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT; + pages = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); + idx = bootmem_bootmap_pages(pages); for (i = 0; i < idx; i++, page++) __free_pages_bootmem(page, 0); count += i; -- cgit v1.2.2 From 636cc40cb79f511d9caa27ef098a83e4fa4971fb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:28:03 -0700 Subject: bootmem: revisit bootmem descriptor list handling link_bootmem handles an insertion of a new descriptor into the sorted list in more or less three explicit branches; empty list, insert in between and append. These cases can be expressed implicite. 
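A standalone C sketch of that implicit form (a hand-rolled circular list with a sentinel head stands in for list_head; all names are made up): walking until the first entry with a larger key and inserting before the iterator handles the empty list, a middle insert and the append with a single code path.

#include <stdio.h>

struct node {
	unsigned long key;
	struct node *prev, *next;
};

static struct node head = { 0, &head, &head };	/* circular sentinel */

static void insert_before(struct node *new, struct node *pos)
{
	new->prev = pos->prev;
	new->next = pos;
	pos->prev->next = new;
	pos->prev = new;
}

static void link_sorted(struct node *new)
{
	struct node *iter;

	for (iter = head.next; iter != &head; iter = iter->next)
		if (new->key < iter->key)
			break;

	/* iter is either the first larger entry or the head (append) */
	insert_before(new, iter);
}

int main(void)
{
	struct node a = { 30 }, b = { 10 }, c = { 20 }, *n;

	link_sorted(&a);
	link_sorted(&b);
	link_sorted(&c);
	for (n = head.next; n != &head; n = n->next)
		printf("%lu\n", n->key);	/* prints 10 20 30 */
	return 0;
}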
Also mark the sorted list as initdata as it can be thrown away after boot as well. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 484849bfc8c4..9da7d4097810 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -23,7 +23,6 @@ unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -static LIST_HEAD(bdata_list); #ifdef CONFIG_CRASH_DUMP /* * If we have booted due to a crash, max_pfn will be a very low value. We need @@ -34,6 +33,8 @@ unsigned long saved_max_pfn; bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; +static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); + static int bootmem_debug; static int __init bootmem_debug_setup(char *buf) @@ -73,20 +74,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages) */ static void __init link_bootmem(bootmem_data_t *bdata) { - bootmem_data_t *ent; + struct list_head *iter; - if (list_empty(&bdata_list)) { - list_add(&bdata->list, &bdata_list); - return; - } - /* insert in order */ - list_for_each_entry(ent, &bdata_list, list) { - if (bdata->node_boot_start < ent->node_boot_start) { - list_add_tail(&bdata->list, &ent->list); - return; - } + list_for_each(iter, &bdata_list) { + bootmem_data_t *ent; + + ent = list_entry(iter, bootmem_data_t, list); + if (bdata->node_boot_start < ent->node_boot_start) + break; } - list_add_tail(&bdata->list, &bdata_list); + list_add_tail(&bdata->list, iter); } /* -- cgit v1.2.2 From 41546c17418fba08ece978bad72a33072715b8f3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:28:03 -0700 Subject: bootmem: clean up free_all_bootmem_core Rewrite the code in a more concise way using less variables. [akpm@linux-foundation.org: fix printk warnings] Signed-off-by: Johannes Weiner Cc: Ingo Molnar Cc: Yinghai Lu Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 83 ++++++++++++++++++++++++++++-------------------------------- 1 file changed, 38 insertions(+), 45 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 9da7d4097810..300d126ec533 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -144,66 +144,59 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { + int aligned; struct page *page; - unsigned long pfn; - unsigned long i, count; - unsigned long idx, pages; - unsigned long *map; - int gofast = 0; - - BUG_ON(!bdata->node_bootmem_map); - - count = 0; - /* first extant page of the node */ - pfn = PFN_DOWN(bdata->node_boot_start); - idx = bdata->node_low_pfn - pfn; - map = bdata->node_bootmem_map; + unsigned long start, end, pages, count = 0; + + if (!bdata->node_bootmem_map) + return 0; + + start = PFN_DOWN(bdata->node_boot_start); + end = bdata->node_low_pfn; + /* - * Check if we are aligned to BITS_PER_LONG pages. If so, we might - * be able to free page orders of that size at once. + * If the start is aligned to the machines wordsize, we might + * be able to free pages in bulks of that order. 
*/ - if (!(pfn & (BITS_PER_LONG-1))) - gofast = 1; + aligned = !(start & (BITS_PER_LONG - 1)); + + bdebug("nid=%td start=%lx end=%lx aligned=%d\n", + bdata - bootmem_node_data, start, end, aligned); + + while (start < end) { + unsigned long *map, idx, vec; - for (i = 0; i < idx; ) { - unsigned long v = ~map[i / BITS_PER_LONG]; + map = bdata->node_bootmem_map; + idx = start - PFN_DOWN(bdata->node_boot_start); + vec = ~map[idx / BITS_PER_LONG]; - if (gofast && v == ~0UL) { - int order; + if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { + int order = ilog2(BITS_PER_LONG); - page = pfn_to_page(pfn); + __free_pages_bootmem(pfn_to_page(start), order); count += BITS_PER_LONG; - order = ffs(BITS_PER_LONG) - 1; - __free_pages_bootmem(page, order); - i += BITS_PER_LONG; - page += BITS_PER_LONG; - } else if (v) { - unsigned long m; - - page = pfn_to_page(pfn); - for (m = 1; m && i < idx; m<<=1, page++, i++) { - if (v & m) { - count++; + } else { + unsigned long off = 0; + + while (vec && off < BITS_PER_LONG) { + if (vec & 1) { + page = pfn_to_page(start + off); __free_pages_bootmem(page, 0); + count++; } + vec >>= 1; + off++; } - } else { - i += BITS_PER_LONG; } - pfn += BITS_PER_LONG; + start += BITS_PER_LONG; } - /* - * Now free the allocator bitmap itself, it's not - * needed anymore: - */ page = virt_to_page(bdata->node_bootmem_map); pages = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); - idx = bootmem_bootmap_pages(pages); - for (i = 0; i < idx; i++, page++) - __free_pages_bootmem(page, 0); - count += i; - bdata->node_bootmem_map = NULL; + pages = bootmem_bootmap_pages(pages); + count += pages; + while (pages--) + __free_pages_bootmem(page++, 0); bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); -- cgit v1.2.2 From 5f2809e69c7128f86316048221cf45146f69a4a0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:28:05 -0700 Subject: bootmem: clean up alloc_bootmem_core alloc_bootmem_core has become quite nasty to read over time. This is a clean rewrite that keeps the semantics. bdata->last_pos has been dropped. bdata->last_success has been renamed to hint_idx and it is now an index relative to the node's range. Since further block searching might start at this index, it is now set to the end of a succeeded allocation rather than its beginning. bdata->last_offset has been renamed to last_end_off to be more clear that it represents the ending address of the last allocation relative to the node. [y-goto@jp.fujitsu.com: fix new alloc_bootmem_core()] Signed-off-by: Johannes Weiner Signed-off-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 212 +++++++++++++++++++++-------------------------------------- 1 file changed, 76 insertions(+), 136 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 300d126ec533..94ea612deccf 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -242,8 +242,9 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, * considered reserved. */ - if (addr >= bdata->node_boot_start && addr < bdata->last_success) - bdata->last_success = addr; + if (addr >= bdata->node_boot_start && + PFN_DOWN(addr - bdata->node_boot_start) < bdata->hint_idx) + bdata->hint_idx = PFN_DOWN(addr - bdata->node_boot_start); /* * Round up to index to the range. @@ -431,36 +432,16 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, } #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ -/* - * We 'merge' subsequent allocations to save space. 
We might 'lose' - * some fraction of a page if allocations cannot be satisfied due to - * size constraints on boxes where there is physical RAM space - * fragmentation - in these cases (mostly large memory boxes) this - * is not a problem. - * - * On low memory boxes we get it right in 100% of the cases. - * - * alignment has to be a power of 2 value. - * - * NOTE: This function is _not_ reentrant. - */ -static void * __init -alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, - unsigned long align, unsigned long goal, unsigned long limit) +static void * __init alloc_bootmem_core(struct bootmem_data *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) { - unsigned long areasize, preferred; - unsigned long i, start = 0, incr, eidx, end_pfn; - void *ret; - unsigned long node_boot_start; - void *node_bootmem_map; - - if (!size) { - printk("alloc_bootmem_core(): zero-sized request\n"); - BUG(); - } - BUG_ON(align & (align-1)); + unsigned long min, max, start, sidx, midx, step; + + BUG_ON(!size); + BUG_ON(align & (align - 1)); + BUG_ON(limit && goal + size > limit); - /* on nodes without memory - bootmem_map is NULL */ if (!bdata->node_bootmem_map) return NULL; @@ -468,126 +449,85 @@ alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, align, goal, limit); - /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */ - node_boot_start = bdata->node_boot_start; - node_bootmem_map = bdata->node_bootmem_map; - if (align) { - node_boot_start = ALIGN(bdata->node_boot_start, align); - if (node_boot_start > bdata->node_boot_start) - node_bootmem_map = (unsigned long *)bdata->node_bootmem_map + - PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG; - } + min = PFN_DOWN(bdata->node_boot_start); + max = bdata->node_low_pfn; - if (limit && node_boot_start >= limit) + goal >>= PAGE_SHIFT; + limit >>= PAGE_SHIFT; + + if (limit && max > limit) + max = limit; + if (max <= min) return NULL; - end_pfn = bdata->node_low_pfn; - limit = PFN_DOWN(limit); - if (limit && end_pfn > limit) - end_pfn = limit; + step = max(align >> PAGE_SHIFT, 1UL); - eidx = end_pfn - PFN_DOWN(node_boot_start); + if (goal && min < goal && goal < max) + start = ALIGN(goal, step); + else + start = ALIGN(min, step); - /* - * We try to allocate bootmem pages above 'goal' - * first, then we try to allocate lower pages. - */ - preferred = 0; - if (goal && PFN_DOWN(goal) < end_pfn) { - if (goal > node_boot_start) - preferred = goal - node_boot_start; - - if (bdata->last_success > node_boot_start && - bdata->last_success - node_boot_start >= preferred) - if (!limit || (limit && limit > bdata->last_success)) - preferred = bdata->last_success - node_boot_start; - } + sidx = start - PFN_DOWN(bdata->node_boot_start); + midx = max - PFN_DOWN(bdata->node_boot_start); - preferred = PFN_DOWN(ALIGN(preferred, align)); - areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; - incr = align >> PAGE_SHIFT ? 
: 1; + if (bdata->hint_idx > sidx) { + /* Make sure we retry on failure */ + goal = 1; + sidx = ALIGN(bdata->hint_idx, step); + } -restart_scan: - for (i = preferred; i < eidx;) { - unsigned long j; + while (1) { + int merge; + void *region; + unsigned long eidx, i, start_off, end_off; +find_block: + sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); + sidx = ALIGN(sidx, step); + eidx = sidx + PFN_UP(size); - i = find_next_zero_bit(node_bootmem_map, eidx, i); - i = ALIGN(i, incr); - if (i >= eidx) + if (sidx >= midx || eidx > midx) break; - if (test_bit(i, node_bootmem_map)) { - i += incr; - continue; - } - for (j = i + 1; j < i + areasize; ++j) { - if (j >= eidx) - goto fail_block; - if (test_bit(j, node_bootmem_map)) - goto fail_block; - } - start = i; - goto found; - fail_block: - i = ALIGN(j, incr); - if (i == j) - i += incr; - } - - if (preferred > 0) { - preferred = 0; - goto restart_scan; - } - return NULL; -found: - bdata->last_success = PFN_PHYS(start) + node_boot_start; - BUG_ON(start >= eidx); + for (i = sidx; i < eidx; i++) + if (test_bit(i, bdata->node_bootmem_map)) { + sidx = ALIGN(i, step); + if (sidx == i) + sidx += step; + goto find_block; + } - /* - * Is the next page of the previous allocation-end the start - * of this allocation's buffer? If yes then we can 'merge' - * the previous partial page with this allocation. - */ - if (align < PAGE_SIZE && - bdata->last_offset && bdata->last_pos+1 == start) { - unsigned long offset, remaining_size; - offset = ALIGN(bdata->last_offset, align); - BUG_ON(offset > PAGE_SIZE); - remaining_size = PAGE_SIZE - offset; - if (size < remaining_size) { - areasize = 0; - /* last_pos unchanged */ - bdata->last_offset = offset + size; - ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + - offset + node_boot_start); - } else { - remaining_size = size - remaining_size; - areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE; - ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + - offset + node_boot_start); - bdata->last_pos = start + areasize - 1; - bdata->last_offset = remaining_size; - } - bdata->last_offset &= ~PAGE_MASK; - } else { - bdata->last_pos = start + areasize - 1; - bdata->last_offset = size & ~PAGE_MASK; - ret = phys_to_virt(start * PAGE_SIZE + node_boot_start); + if (bdata->last_end_off && + PFN_DOWN(bdata->last_end_off) + 1 == sidx) + start_off = ALIGN(bdata->last_end_off, align); + else + start_off = PFN_PHYS(sidx); + + merge = PFN_DOWN(start_off) < sidx; + end_off = start_off + size; + + bdata->last_end_off = end_off; + bdata->hint_idx = PFN_UP(end_off); + + /* + * Reserve the area now: + */ + for (i = PFN_DOWN(start_off) + merge; + i < PFN_UP(end_off); i++) + if (test_and_set_bit(i, bdata->node_bootmem_map)) + BUG(); + + region = phys_to_virt(bdata->node_boot_start + start_off); + memset(region, 0, size); + return region; } - bdebug("nid=%td start=%lx end=%lx\n", - bdata - bootmem_node_data, - start + PFN_DOWN(bdata->node_boot_start), - start + areasize + PFN_DOWN(bdata->node_boot_start)); + if (goal) { + goal = 0; + sidx = 0; + goto find_block; + } - /* - * Reserve the area now: - */ - for (i = start; i < start + areasize; i++) - if (unlikely(test_and_set_bit(i, node_bootmem_map))) - BUG(); - memset(ret, 0, size); - return ret; + return NULL; } /** -- cgit v1.2.2 From d747fa4bcebcf3696607b86a6b0dafa644be0676 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:28:05 -0700 Subject: bootmem: free/reserve helpers Factor out the common operation of marking a range on the bitmap. 
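A minimal userspace sketch of what such helpers can look like, assuming a flat bitmap and a made-up EXCLUSIVE flag rather than the real bootmem_data_t, with plain -1 in place of -EBUSY:

#include <stdio.h>

#define BITS		128
#define EXCLUSIVE	0x1
#define WORD_BITS	(8 * sizeof(unsigned long))

static unsigned long map[BITS / WORD_BITS];

static int test_and_set(unsigned long idx)
{
	unsigned long mask = 1UL << (idx % WORD_BITS);
	unsigned long *w = &map[idx / WORD_BITS];
	int old = !!(*w & mask);

	*w |= mask;
	return old;
}

static void free_range(unsigned long sidx, unsigned long eidx)
{
	for (unsigned long idx = sidx; idx < eidx; idx++)
		map[idx / WORD_BITS] &= ~(1UL << (idx % WORD_BITS));
}

static int reserve_range(unsigned long sidx, unsigned long eidx, int flags)
{
	for (unsigned long idx = sidx; idx < eidx; idx++)
		if (test_and_set(idx) && (flags & EXCLUSIVE)) {
			free_range(sidx, idx);	/* clear [sidx, idx) again */
			return -1;		/* stands in for -EBUSY */
		}
	return 0;
}

int main(void)
{
	printf("%d\n", reserve_range(10, 20, 0));		/* 0 */
	printf("%d\n", reserve_range(15, 25, EXCLUSIVE));	/* -1 */
	return 0;
}

Letting the free path and the failed-exclusive rollback share one clearing loop is what makes factoring the two helpers out worthwhile.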
[akpm@linux-foundation.org: fix various warnings] Signed-off-by: Johannes Weiner Cc: Ingo Molnar Cc: Yinghai Lu Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 65 ++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 94ea612deccf..9d03ff651359 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -225,6 +225,44 @@ unsigned long __init free_all_bootmem(void) return free_all_bootmem_core(NODE_DATA(0)->bdata); } +static void __init __free(bootmem_data_t *bdata, + unsigned long sidx, unsigned long eidx) +{ + unsigned long idx; + + bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, + sidx + PFN_DOWN(bdata->node_boot_start), + eidx + PFN_DOWN(bdata->node_boot_start)); + + for (idx = sidx; idx < eidx; idx++) + if (!test_and_clear_bit(idx, bdata->node_bootmem_map)) + BUG(); +} + +static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, + unsigned long eidx, int flags) +{ + unsigned long idx; + int exclusive = flags & BOOTMEM_EXCLUSIVE; + + bdebug("nid=%td start=%lx end=%lx flags=%x\n", + bdata - bootmem_node_data, + sidx + PFN_DOWN(bdata->node_boot_start), + eidx + PFN_DOWN(bdata->node_boot_start), + flags); + + for (idx = sidx; idx < eidx; idx++) + if (test_and_set_bit(idx, bdata->node_bootmem_map)) { + if (exclusive) { + __free(bdata, sidx, idx); + return -EBUSY; + } + bdebug("silent double reserve of PFN %lx\n", + idx + PFN_DOWN(bdata->node_boot_start)); + } + return 0; +} + static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) { @@ -258,14 +296,7 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); - bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, - sidx + PFN_DOWN(bdata->node_boot_start), - eidx + PFN_DOWN(bdata->node_boot_start)); - - for (i = sidx; i < eidx; i++) { - if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) - BUG(); - } + __free(bdata, sidx, eidx); } /** @@ -367,16 +398,7 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); - bdebug("nid=%td start=%lx end=%lx flags=%x\n", - bdata - bootmem_node_data, - sidx + PFN_DOWN(bdata->node_boot_start), - eidx + PFN_DOWN(bdata->node_boot_start), - flags); - - for (i = sidx; i < eidx; i++) - if (test_and_set_bit(i, bdata->node_bootmem_map)) - bdebug("hm, page %lx reserved twice.\n", - PFN_DOWN(bdata->node_boot_start) + i); + return __reserve(bdata, sidx, eidx, flags); } /** @@ -511,10 +533,9 @@ find_block: /* * Reserve the area now: */ - for (i = PFN_DOWN(start_off) + merge; - i < PFN_UP(end_off); i++) - if (test_and_set_bit(i, bdata->node_bootmem_map)) - BUG(); + if (__reserve(bdata, PFN_DOWN(start_off) + merge, + PFN_UP(end_off), BOOTMEM_EXCLUSIVE)) + BUG(); region = phys_to_virt(bdata->node_boot_start + start_off); memset(region, 0, size); -- cgit v1.2.2 From e2bf3cae515090fefe28329e71230dfe7ab873b1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:28:06 -0700 Subject: bootmem: factor out the marking of a PFN range Introduce new helpers that mark a range that resides completely on a node or node-agnostic ranges that might also span node boundaries. 
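Sketched as standalone C with an invented two-node layout (the array, the pfn numbers and the helper names are all assumptions of the example), the node-agnostic walk roughly amounts to clamping the request to each node's range, delegating that piece to a per-node marker and continuing on the following node:

#include <stdio.h>

struct node_range { unsigned long min_pfn, low_pfn; };

static struct node_range nodes[] = { { 0, 100 }, { 100, 250 } };

static void mark_node(int nid, unsigned long start, unsigned long end)
{
	printf("node %d: mark pfns [%lu, %lu)\n", nid, start, end);
}

static void mark_range(unsigned long start, unsigned long end)
{
	int nr = sizeof(nodes) / sizeof(nodes[0]);
	unsigned long pos = start;

	for (int nid = 0; nid < nr; nid++) {
		unsigned long max;

		if (pos < nodes[nid].min_pfn || pos >= nodes[nid].low_pfn)
			continue;

		max = end < nodes[nid].low_pfn ? end : nodes[nid].low_pfn;
		mark_node(nid, pos, max);
		if (max == end)
			return;
		pos = nodes[nid].low_pfn;	/* carry on with the next node */
	}
}

int main(void)
{
	mark_range(50, 180);	/* spans both example nodes */
	return 0;
}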
The free/reserve API functions will then directly use these helpers. Note that the free/reserve semantics become more strict: while the prior code took basically arbitrary range arguments and marked the PFNs that happen to fall into that range, the new code requires node-specific ranges to be completely on the node. The node-agnostic requests might span node boundaries as long as the nodes are contiguous. Passing ranges that do not satisfy these criteria is a bug. [akpm@linux-foundation.org: fix printk warnings] Signed-off-by: Johannes Weiner Cc: Ingo Molnar Cc: Yinghai Lu Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 188 ++++++++++++++++++++++------------------------------------- 1 file changed, 69 insertions(+), 119 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 9d03ff651359..e5415a5414a5 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -234,6 +234,9 @@ static void __init __free(bootmem_data_t *bdata, sidx + PFN_DOWN(bdata->node_boot_start), eidx + PFN_DOWN(bdata->node_boot_start)); + if (bdata->hint_idx > sidx) + bdata->hint_idx = sidx; + for (idx = sidx; idx < eidx; idx++) if (!test_and_clear_bit(idx, bdata->node_bootmem_map)) BUG(); @@ -263,40 +266,57 @@ static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, return 0; } -static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, - unsigned long size) +static int __init mark_bootmem_node(bootmem_data_t *bdata, + unsigned long start, unsigned long end, + int reserve, int flags) { unsigned long sidx, eidx; - unsigned long i; - BUG_ON(!size); + bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n", + bdata - bootmem_node_data, start, end, reserve, flags); - /* out range */ - if (addr + size < bdata->node_boot_start || - PFN_DOWN(addr) > bdata->node_low_pfn) - return; - /* - * round down end of usable mem, partially free pages are - * considered reserved. - */ + BUG_ON(start < PFN_DOWN(bdata->node_boot_start)); + BUG_ON(end > bdata->node_low_pfn); - if (addr >= bdata->node_boot_start && - PFN_DOWN(addr - bdata->node_boot_start) < bdata->hint_idx) - bdata->hint_idx = PFN_DOWN(addr - bdata->node_boot_start); + sidx = start - PFN_DOWN(bdata->node_boot_start); + eidx = end - PFN_DOWN(bdata->node_boot_start); - /* - * Round up to index to the range. 
- */ - if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start)) - sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start); + if (reserve) + return __reserve(bdata, sidx, eidx, flags); else - sidx = 0; + __free(bdata, sidx, eidx); + return 0; +} + +static int __init mark_bootmem(unsigned long start, unsigned long end, + int reserve, int flags) +{ + unsigned long pos; + bootmem_data_t *bdata; + + pos = start; + list_for_each_entry(bdata, &bdata_list, list) { + int err; + unsigned long max; + + if (pos < PFN_DOWN(bdata->node_boot_start)) { + BUG_ON(pos != start); + continue; + } + + max = min(bdata->node_low_pfn, end); - eidx = PFN_DOWN(addr + size - bdata->node_boot_start); - if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) - eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); + err = mark_bootmem_node(bdata, pos, max, reserve, flags); + if (reserve && err) { + mark_bootmem(start, pos, 0, 0); + return err; + } - __free(bdata, sidx, eidx); + if (max == end) + return 0; + pos = bdata->node_low_pfn; + } + BUG(); } /** @@ -307,12 +327,17 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, * * Partial pages will be considered reserved and left as they are. * - * Only physical pages that actually reside on @pgdat are marked. + * The range must reside completely on the specified node. */ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { - free_bootmem_core(pgdat->bdata, physaddr, size); + unsigned long start, end; + + start = PFN_UP(physaddr); + end = PFN_DOWN(physaddr + size); + + mark_bootmem_node(pgdat->bdata, start, end, 0, 0); } /** @@ -322,83 +347,16 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, * * Partial pages will be considered reserved and left as they are. * - * All physical pages within the range are marked, no matter what - * node they reside on. + * The range must be contiguous but may span node boundaries. */ void __init free_bootmem(unsigned long addr, unsigned long size) { - bootmem_data_t *bdata; - list_for_each_entry(bdata, &bdata_list, list) - free_bootmem_core(bdata, addr, size); -} - -/* - * Marks a particular physical memory range as unallocatable. Usable RAM - * might be used for boot-time allocations - or it might get added - * to the free page pool later on. - */ -static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, - unsigned long addr, unsigned long size, int flags) -{ - unsigned long sidx, eidx; - unsigned long i; - - BUG_ON(!size); - - /* out of range, don't hold other */ - if (addr + size < bdata->node_boot_start || - PFN_DOWN(addr) > bdata->node_low_pfn) - return 0; - - /* - * Round up to index to the range. - */ - if (addr > bdata->node_boot_start) - sidx= PFN_DOWN(addr - bdata->node_boot_start); - else - sidx = 0; - - eidx = PFN_UP(addr + size - bdata->node_boot_start); - if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) - eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); - - for (i = sidx; i < eidx; i++) { - if (test_bit(i, bdata->node_bootmem_map)) { - if (flags & BOOTMEM_EXCLUSIVE) - return -EBUSY; - } - } - - return 0; - -} - -static void __init reserve_bootmem_core(bootmem_data_t *bdata, - unsigned long addr, unsigned long size, int flags) -{ - unsigned long sidx, eidx; - unsigned long i; - - BUG_ON(!size); - - /* out of range */ - if (addr + size < bdata->node_boot_start || - PFN_DOWN(addr) > bdata->node_low_pfn) - return; - - /* - * Round up to index to the range. 
- */ - if (addr > bdata->node_boot_start) - sidx= PFN_DOWN(addr - bdata->node_boot_start); - else - sidx = 0; + unsigned long start, end; - eidx = PFN_UP(addr + size - bdata->node_boot_start); - if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) - eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); + start = PFN_UP(addr); + end = PFN_DOWN(addr + size); - return __reserve(bdata, sidx, eidx, flags); + mark_bootmem(start, end, 0, 0); } /** @@ -410,18 +368,17 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, * * Partial pages will be reserved. * - * Only physical pages that actually reside on @pgdat are marked. + * The range must reside completely on the specified node. */ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { - int ret; + unsigned long start, end; - ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); - if (ret < 0) - return -ENOMEM; - reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); - return 0; + start = PFN_DOWN(physaddr); + end = PFN_UP(physaddr + size); + + return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); } #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE @@ -433,24 +390,17 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, * * Partial pages will be reserved. * - * All physical pages within the range are marked, no matter what - * node they reside on. + * The range must be contiguous but may span node boundaries. */ int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { - bootmem_data_t *bdata; - int ret; + unsigned long start, end; - list_for_each_entry(bdata, &bdata_list, list) { - ret = can_reserve_bootmem_core(bdata, addr, size, flags); - if (ret < 0) - return ret; - } - list_for_each_entry(bdata, &bdata_list, list) - reserve_bootmem_core(bdata, addr, size, flags); + start = PFN_DOWN(addr); + end = PFN_UP(addr + size); - return 0; + return mark_bootmem(start, end, 1, flags); } #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ @@ -663,7 +613,7 @@ void * __init alloc_bootmem_section(unsigned long size, if (start_nr != section_nr || end_nr != section_nr) { printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", section_nr); - free_bootmem_core(pgdat->bdata, __pa(ptr), size); + free_bootmem_node(pgdat, __pa(ptr), size); ptr = NULL; } -- cgit v1.2.2 From 0f3caba211babef6e3fbde1ba76ddc79321bc92f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:28:07 -0700 Subject: bootmem: respect goal more likely The old node-agnostic code tried allocating on all nodes starting from the one with the lowest range. alloc_bootmem_core retried without the goal if it could not satisfy it and so the goal was only respected at all when it happened to be on the first (lowest page numbers) node (or theoretically if allocations failed on all nodes before to the one holding the goal). Introduce a non-panicking helper that starts allocating from the node holding the goal and falls back only after all thes tries failed, thus moving the goal fallback code out of alloc_bootmem_core. Make all other allocation functions benefit from this new helper. 
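The resulting control flow can be sketched in a few lines of standalone C (the node table and the fake per-node allocator are invented; the real helper returns memory, not a node id): nodes that end at or below the goal are skipped on the first pass, and only when the whole pass fails is the goal dropped and the scan restarted.

#include <stdio.h>

#define NR_NODES 3

/* per-node upper pfn bound, made up for the example */
static unsigned long node_low_pfn[NR_NODES] = { 100, 200, 300 };

static int alloc_on_node(int nid, unsigned long goal)
{
	return nid == 1;	/* pretend only node 1 has room */
}

static int alloc_with_goal(unsigned long goal)
{
restart:
	for (int nid = 0; nid < NR_NODES; nid++) {
		if (goal && node_low_pfn[nid] <= goal)
			continue;	/* node ends below the goal */
		if (alloc_on_node(nid, goal))
			return nid;
	}
	if (goal) {
		goal = 0;	/* drop the goal, retry all nodes */
		goto restart;
	}
	return -1;
}

int main(void)
{
	printf("allocated on node %d\n", alloc_with_goal(250));	/* node 1 */
	return 0;
}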
Signed-off-by: Johannes Weiner Cc: Ingo Molnar Cc: Yinghai Lu Cc: Andi Kleen Cc: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 92 +++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index e5415a5414a5..89646f77b427 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -408,6 +408,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { + unsigned long fallback = 0; unsigned long min, max, start, sidx, midx, step; BUG_ON(!size); @@ -443,8 +444,11 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, midx = max - PFN_DOWN(bdata->node_boot_start); if (bdata->hint_idx > sidx) { - /* Make sure we retry on failure */ - goal = 1; + /* + * Handle the valid case of sidx being zero and still + * catch the fallback below. + */ + fallback = sidx + 1; sidx = ALIGN(bdata->hint_idx, step); } @@ -492,10 +496,39 @@ find_block: return region; } + if (fallback) { + sidx = ALIGN(fallback - 1, step); + fallback = 0; + goto find_block; + } + + return NULL; +} + +static void * __init ___alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + bootmem_data_t *bdata; + +restart: + list_for_each_entry(bdata, &bdata_list, list) { + void *region; + + if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) + continue; + if (limit && bdata->node_boot_start >= limit) + break; + + region = alloc_bootmem_core(bdata, size, align, goal, limit); + if (region) + return region; + } + if (goal) { goal = 0; - sidx = 0; - goto find_block; + goto restart; } return NULL; @@ -515,16 +548,23 @@ find_block: * Returns NULL on failure. */ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, - unsigned long goal) + unsigned long goal) { - bootmem_data_t *bdata; - void *ptr; + return ___alloc_bootmem_nopanic(size, align, goal, 0); +} - list_for_each_entry(bdata, &bdata_list, list) { - ptr = alloc_bootmem_core(bdata, size, align, goal, 0); - if (ptr) - return ptr; - } +static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); + + if (mem) + return mem; + /* + * Whoops, we cannot satisfy the allocation request. + */ + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); return NULL; } @@ -544,16 +584,7 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { - void *mem = __alloc_bootmem_nopanic(size,align,goal); - - if (mem) - return mem; - /* - * Whoops, we cannot satisfy the allocation request. 
- */ - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; + return ___alloc_bootmem(size, align, goal, 0); } /** @@ -653,22 +684,7 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) { - bootmem_data_t *bdata; - void *ptr; - - list_for_each_entry(bdata, &bdata_list, list) { - ptr = alloc_bootmem_core(bdata, size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); - if (ptr) - return ptr; - } - - /* - * Whoops, we cannot satisfy the allocation request. - */ - printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size); - panic("Out of low memory"); - return NULL; + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); } /** -- cgit v1.2.2 From 4cc278b721d5bf3569dfc5f1100253042e097bc3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:28:08 -0700 Subject: bootmem: Make __alloc_bootmem_low_node fall back to other nodes __alloc_bootmem_node already does this, make the interface consistent. Signed-off-by: Johannes Weiner Cc: Ingo Molnar Cc: Yinghai Lu Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 89646f77b427..459da4710b8f 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -587,6 +587,19 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, 0); } +static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + void *ptr; + + ptr = alloc_bootmem_core(bdata, size, align, goal, limit); + if (ptr) + return ptr; + + return ___alloc_bootmem(size, align, goal, limit); +} + /** * __alloc_bootmem_node - allocate boot memory from a specific node * @pgdat: node to allocate from @@ -605,13 +618,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - - ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); - if (ptr) - return ptr; - - return __alloc_bootmem(size, align, goal); + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); } #ifdef CONFIG_SPARSEMEM @@ -705,6 +712,6 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - return alloc_bootmem_core(pgdat->bdata, size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); + return ___alloc_bootmem_node(pgdat->bdata, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); } -- cgit v1.2.2 From 75a56cfe9fdb064d1db1cfbc564315fddb756fb1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:28:09 -0700 Subject: bootmem: revisit alloc_bootmem_section Since alloc_bootmem_core does no goal-fallback anymore and just returns NULL if the allocation fails, we might now use it in alloc_bootmem_section without all the fixup code for a misplaced allocation. Also, the limit can be the first PFN of the next section as the semantics is that the limit is _above_ the allocated region, not within. 
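A back-of-the-envelope sketch of the new goal/limit arithmetic, using typical x86_64 shift values purely for illustration:

#include <stdio.h>

#define PAGE_SHIFT		12
#define SECTION_SIZE_BITS	27	/* example value */
#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)

static unsigned long section_nr_to_pfn(unsigned long nr)
{
	return nr << PFN_SECTION_SHIFT;
}

int main(void)
{
	unsigned long section_nr = 3;
	unsigned long goal  = section_nr_to_pfn(section_nr) << PAGE_SHIFT;
	unsigned long limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;

	/* allocations must land in [goal, limit) to stay inside the section */
	printf("goal=%#lx limit=%#lx\n", goal, limit);
	return 0;
}

Because the limit is exclusive, pointing it at the first byte of the following section keeps the allocation inside the requested section, which is what lets the old post-allocation section check and its free_bootmem_node() fixup go away.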
Signed-off-by: Johannes Weiner Cc: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 459da4710b8f..282b786c2b15 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -632,30 +632,15 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { - void *ptr; - unsigned long limit, goal, start_nr, end_nr, pfn; - struct pglist_data *pgdat; + bootmem_data_t *bdata; + unsigned long pfn, goal, limit; pfn = section_nr_to_pfn(section_nr); - goal = PFN_PHYS(pfn); - limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; - pgdat = NODE_DATA(early_pfn_to_nid(pfn)); - ptr = alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal, - limit); - - if (!ptr) - return NULL; - - start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); - end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); - if (start_nr != section_nr || end_nr != section_nr) { - printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", - section_nr); - free_bootmem_node(pgdat, __pa(ptr), size); - ptr = NULL; - } + goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; - return ptr; + return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); } #endif -- cgit v1.2.2 From 3560e249abda6bee41a07a7bf0383a6e193e2839 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:28:09 -0700 Subject: bootmem: replace node_boot_start in struct bootmem_data Almost all users of this field need a PFN instead of a physical address, so replace node_boot_start with node_min_pfn. 
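A tiny illustration of why the PFN is the handier representation (all values invented): with the byte address stored, every bitmap index needs a PFN_DOWN() conversion first, while storing the minimum PFN turns the index into a plain subtraction.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

int main(void)
{
	unsigned long node_boot_start = 0x40000000;	/* old field: bytes */
	unsigned long node_min_pfn = PFN_DOWN(node_boot_start);	/* new field */
	unsigned long pfn = 0x40123;

	unsigned long idx_old = pfn - PFN_DOWN(node_boot_start);	/* convert on every use */
	unsigned long idx_new = pfn - node_min_pfn;			/* already a pfn */

	printf("%lu %lu\n", idx_old, idx_new);	/* identical results */
	return 0;
}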
[Lee.Schermerhorn@hp.com: fix spurious BUG_ON() in mark_bootmem()] Signed-off-by: Johannes Weiner Cc: Signed-off-by: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 282b786c2b15..4af15d0340ad 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -80,7 +80,7 @@ static void __init link_bootmem(bootmem_data_t *bdata) bootmem_data_t *ent; ent = list_entry(iter, bootmem_data_t, list); - if (bdata->node_boot_start < ent->node_boot_start) + if (bdata->node_min_pfn < ent->node_min_pfn) break; } list_add_tail(&bdata->list, iter); @@ -96,7 +96,7 @@ static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, mminit_validate_memmodel_limits(&start, &end); bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); - bdata->node_boot_start = PFN_PHYS(start); + bdata->node_min_pfn = start; bdata->node_low_pfn = end; link_bootmem(bdata); @@ -151,7 +151,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (!bdata->node_bootmem_map) return 0; - start = PFN_DOWN(bdata->node_boot_start); + start = bdata->node_min_pfn; end = bdata->node_low_pfn; /* @@ -167,7 +167,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) unsigned long *map, idx, vec; map = bdata->node_bootmem_map; - idx = start - PFN_DOWN(bdata->node_boot_start); + idx = start - bdata->node_min_pfn; vec = ~map[idx / BITS_PER_LONG]; if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { @@ -192,7 +192,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) } page = virt_to_page(bdata->node_bootmem_map); - pages = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); + pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; while (pages--) @@ -231,8 +231,8 @@ static void __init __free(bootmem_data_t *bdata, unsigned long idx; bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, - sidx + PFN_DOWN(bdata->node_boot_start), - eidx + PFN_DOWN(bdata->node_boot_start)); + sidx + bdata->node_min_pfn, + eidx + bdata->node_min_pfn); if (bdata->hint_idx > sidx) bdata->hint_idx = sidx; @@ -250,8 +250,8 @@ static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, bdebug("nid=%td start=%lx end=%lx flags=%x\n", bdata - bootmem_node_data, - sidx + PFN_DOWN(bdata->node_boot_start), - eidx + PFN_DOWN(bdata->node_boot_start), + sidx + bdata->node_min_pfn, + eidx + bdata->node_min_pfn, flags); for (idx = sidx; idx < eidx; idx++) @@ -261,7 +261,7 @@ static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, return -EBUSY; } bdebug("silent double reserve of PFN %lx\n", - idx + PFN_DOWN(bdata->node_boot_start)); + idx + bdata->node_min_pfn); } return 0; } @@ -275,11 +275,11 @@ static int __init mark_bootmem_node(bootmem_data_t *bdata, bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n", bdata - bootmem_node_data, start, end, reserve, flags); - BUG_ON(start < PFN_DOWN(bdata->node_boot_start)); + BUG_ON(start < bdata->node_min_pfn); BUG_ON(end > bdata->node_low_pfn); - sidx = start - PFN_DOWN(bdata->node_boot_start); - eidx = end - PFN_DOWN(bdata->node_boot_start); + sidx = start - bdata->node_min_pfn; + eidx = end - bdata->node_min_pfn; if (reserve) return __reserve(bdata, sidx, eidx, flags); @@ -299,7 +299,8 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, int err; 
unsigned long max; - if (pos < PFN_DOWN(bdata->node_boot_start)) { + if (pos < bdata->node_min_pfn || + pos >= bdata->node_low_pfn) { BUG_ON(pos != start); continue; } @@ -422,7 +423,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, align, goal, limit); - min = PFN_DOWN(bdata->node_boot_start); + min = bdata->node_min_pfn; max = bdata->node_low_pfn; goal >>= PAGE_SHIFT; @@ -440,8 +441,8 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, else start = ALIGN(min, step); - sidx = start - PFN_DOWN(bdata->node_boot_start); - midx = max - PFN_DOWN(bdata->node_boot_start); + sidx = start - bdata->node_min_pfn;; + midx = max - bdata->node_min_pfn; if (bdata->hint_idx > sidx) { /* @@ -491,7 +492,8 @@ find_block: PFN_UP(end_off), BOOTMEM_EXCLUSIVE)) BUG(); - region = phys_to_virt(bdata->node_boot_start + start_off); + region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + + start_off); memset(region, 0, size); return region; } @@ -518,7 +520,7 @@ restart: if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) continue; - if (limit && bdata->node_boot_start >= limit) + if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) break; region = alloc_bootmem_core(bdata, size, align, goal, limit); -- cgit v1.2.2 From 2be0ffe2b29bd31d3debd0877797892ff2d91f4c Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Wed, 23 Jul 2008 21:28:11 -0700 Subject: mm: add alloc_pages_exact() and free_pages_exact() alloc_pages_exact() is similar to alloc_pages(), except that it allocates the minimum number of pages to fulfill the request. This is useful if you want to allocate a very large buffer that is slightly larger than an even power-of-two number of pages. In that case, alloc_pages() will waste a lot of memory. I have a video driver that wants to allocate a 5MB buffer. alloc_pages() wiill waste 3MB of physically-contiguous memory. Signed-off-by: Timur Tabi Cc: Andi Kleen Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaa86671ebbd..8d528d57b403 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1697,6 +1697,59 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); +/** + * alloc_pages_exact - allocate an exact number physically-contiguous pages. + * @size: the number of bytes to allocate + * @gfp_mask: GFP flags for the allocation + * + * This function is similar to alloc_pages(), except that it allocates the + * minimum number of pages to satisfy the request. alloc_pages() can only + * allocate memory in power-of-two pages. + * + * This function is also limited by MAX_ORDER. + * + * Memory allocated by this function must be released by free_pages_exact(). + */ +void *alloc_pages_exact(size_t size, gfp_t gfp_mask) +{ + unsigned int order = get_order(size); + unsigned long addr; + + addr = __get_free_pages(gfp_mask, order); + if (addr) { + unsigned long alloc_end = addr + (PAGE_SIZE << order); + unsigned long used = addr + PAGE_ALIGN(size); + + split_page(virt_to_page(addr), order); + while (used < alloc_end) { + free_page(used); + used += PAGE_SIZE; + } + } + + return (void *)addr; +} +EXPORT_SYMBOL(alloc_pages_exact); + +/** + * free_pages_exact - release memory allocated via alloc_pages_exact() + * @virt: the value returned by alloc_pages_exact. 
+ * @size: size of allocation, same value as passed to alloc_pages_exact(). + * + * Release the memory allocated by a previous call to alloc_pages_exact. + */ +void free_pages_exact(void *virt, size_t size) +{ + unsigned long addr = (unsigned long)virt; + unsigned long end = addr + PAGE_ALIGN(size); + + while (addr < end) { + free_page(addr); + addr += PAGE_SIZE; + } +} +EXPORT_SYMBOL(free_pages_exact); + static unsigned int nr_free_zone_pages(int offset) { struct zoneref *z; -- cgit v1.2.2 From b69a7288ea7bf171328f313f0edae629f50e3bdb Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 23 Jul 2008 21:28:12 -0700 Subject: mm/page_alloc.c: cleanups This patch contains the following cleanups: - make the following needlessly global variables static: - required_kernelcore - zone_movable_pfn[] - make the following needlessly global functions static: - move_freepages() - move_freepages_block() - setup_pageset() - find_usable_zone_for_movable() - adjust_zone_range_for_zone_movable() - __absent_pages_in_range() - find_min_pfn_for_node() - find_zone_movable_pfns_for_nodes() Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d528d57b403..cd4c41432ef6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -153,9 +153,9 @@ static unsigned long __meminitdata dma_reserve; static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ - unsigned long __initdata required_kernelcore; + static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; - unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; + static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -674,9 +674,9 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { * Note that start_page and end_pages are not aligned on a pageblock * boundary. 
If alignment is required, use move_freepages_block() */ -int move_freepages(struct zone *zone, - struct page *start_page, struct page *end_page, - int migratetype) +static int move_freepages(struct zone *zone, + struct page *start_page, struct page *end_page, + int migratetype) { struct page *page; unsigned long order; @@ -715,7 +715,8 @@ int move_freepages(struct zone *zone, return pages_moved; } -int move_freepages_block(struct zone *zone, struct page *page, int migratetype) +static int move_freepages_block(struct zone *zone, struct page *page, + int migratetype) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -2652,7 +2653,7 @@ static int zone_batchsize(struct zone *zone) return batch; } -inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) { struct per_cpu_pages *pcp; @@ -3099,7 +3100,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, * assumption is made that zones within a node are ordered in monotonic * increasing memory addresses so that the "highest" populated zone is used */ -void __init find_usable_zone_for_movable(void) +static void __init find_usable_zone_for_movable(void) { int zone_index; for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { @@ -3125,7 +3126,7 @@ void __init find_usable_zone_for_movable(void) * highest usable zone for ZONE_MOVABLE. This preserves the assumption that * zones within a node are in order of monotonic increases memory addresses */ -void __meminit adjust_zone_range_for_zone_movable(int nid, +static void __meminit adjust_zone_range_for_zone_movable(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, @@ -3186,7 +3187,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. */ -unsigned long __meminit __absent_pages_in_range(int nid, +static unsigned long __meminit __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { @@ -3723,7 +3724,7 @@ static void __init sort_node_map(void) } /* Find the lowest pfn for a node */ -unsigned long __init find_min_pfn_for_node(int nid) +static unsigned long __init find_min_pfn_for_node(int nid) { int i; unsigned long min_pfn = ULONG_MAX; @@ -3795,7 +3796,7 @@ static unsigned long __init early_calculate_totalpages(void) * memory. When they don't, some nodes will have more kernelcore than * others */ -void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) +static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) { int i, nid; unsigned long usable_startpfn; -- cgit v1.2.2 From d92bc318547507a944a22e7ef936793dc0fe167f Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 23 Jul 2008 21:28:12 -0700 Subject: mm: make register_page_bootmem_info_section() static Make the needlessly global register_page_bootmem_info_section() static. 
Signed-off-by: Adrian Bunk Acked-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6e26adc08f14..ec85c37dcfb9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -86,7 +86,7 @@ void put_page_bootmem(struct page *page) } -void register_page_bootmem_info_section(unsigned long start_pfn) +static void register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long *usemap, mapsize, section_nr, i; struct mem_section *ms; -- cgit v1.2.2 From f84f9504bddeec33a72d64ebe95143d3aaeb3f9b Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Wed, 23 Jul 2008 21:28:14 -0700 Subject: mm: remove initialization of static per-cpu variables This was required by some old, no-longer-used gcc on sparc. Signed-off-by: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 45c9f25a8a3b..dd89234ee51f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -34,9 +34,9 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; -static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; -static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; -static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; +static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); /* * This path almost never happens for VM activity - pages are normally @@ -493,7 +493,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag); */ #define ACCT_THRESHOLD max(16, NR_CPUS * 2) -static DEFINE_PER_CPU(long, committed_space) = 0; +static DEFINE_PER_CPU(long, committed_space); void vm_acct_memory(long pages) { -- cgit v1.2.2 From 48c906823f3927b981db9f0b03c2e2499977ee93 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Wed, 23 Jul 2008 21:28:15 -0700 Subject: memory hotplug: allocate usemap on the section with pgdat With this patch, usemaps are allocated on the section that holds the pgdat. Because a usemap is very small, the usemaps for many other sections end up packed into a single page. If a section holds such usemaps, it cannot be removed until those other sections are removed first, and this dependency is undesirable for memory removal. The pgdat has a similar property: when a section contains the pgdat area, it must be the last section removed on that node. So if section A holds the pgdat and section B holds the usemap for section A, neither section can be removed because each depends on the other. To solve this, the patch gathers usemaps onto the same section as the pgdat whenever possible. If no other section then depends on it, that section can finally be removed.
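In condensed form, the allocation preference added to mm/sparse.c below amounts to the following sketch (error handling and the !CONFIG_MEMORY_HOTREMOVE stub omitted; all names are taken from the diff that follows):

	/* Prefer bootmem from the section that already holds this node's pgdat. */
	section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
	usemap = alloc_bootmem_section(usemap_size(), section_nr);
	if (!usemap)
		/* Fall back to any bootmem on the node; the fallback path
		 * reports it if this creates a cross-section dependency. */
		usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());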
Signed-off-by: Yasunori Goto Cc: Mel Gorman Cc: Andy Whitcroft Cc: David Miller Cc: Badari Pulavarty Cc: Heiko Carstens Cc: Hiroyuki KAMEZAWA Cc: Tony Breeds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 7a3650923d9a..8ffc08990008 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -269,16 +269,92 @@ static unsigned long *__kmalloc_section_usemap(void) } #endif /* CONFIG_MEMORY_HOTPLUG */ +#ifdef CONFIG_MEMORY_HOTREMOVE +static unsigned long * __init +sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) +{ + unsigned long section_nr; + + /* + * A page may contain usemaps for other sections preventing the + * page being freed and making a section unremovable while + * other sections referencing the usemap retmain active. Similarly, + * a pgdat can prevent a section being removed. If section A + * contains a pgdat and section B contains the usemap, both + * sections become inter-dependent. This allocates usemaps + * from the same section as the pgdat where possible to avoid + * this problem. + */ + section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); + return alloc_bootmem_section(usemap_size(), section_nr); +} + +static void __init check_usemap_section_nr(int nid, unsigned long *usemap) +{ + unsigned long usemap_snr, pgdat_snr; + static unsigned long old_usemap_snr = NR_MEM_SECTIONS; + static unsigned long old_pgdat_snr = NR_MEM_SECTIONS; + struct pglist_data *pgdat = NODE_DATA(nid); + int usemap_nid; + + usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT); + pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); + if (usemap_snr == pgdat_snr) + return; + + if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr) + /* skip redundant message */ + return; + + old_usemap_snr = usemap_snr; + old_pgdat_snr = pgdat_snr; + + usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr)); + if (usemap_nid != nid) { + printk(KERN_INFO + "node %d must be removed before remove section %ld\n", + nid, usemap_snr); + return; + } + /* + * There is a circular dependency. + * Some platforms allow un-removable section because they will just + * gather other removable sections for dynamic partitioning. + * Just notify un-removable section's number here. 
+ */ + printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr, + pgdat_snr, nid); + printk(KERN_CONT + " have a circular dependency on usemap and pgdat allocations\n"); +} +#else +static unsigned long * __init +sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) +{ + return NULL; +} + +static void __init check_usemap_section_nr(int nid, unsigned long *usemap) +{ +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) { unsigned long *usemap; struct mem_section *ms = __nr_to_section(pnum); int nid = sparse_early_nid(ms); - usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); + usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid)); if (usemap) return usemap; + usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); + if (usemap) { + check_usemap_section_nr(nid, usemap); + return usemap; + } + /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ nid = 0; -- cgit v1.2.2 From af370fb8cb3031f20438f246798d5f0d98089f29 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Wed, 23 Jul 2008 21:28:17 -0700 Subject: memory hotplug: small fixes to bootmem freeing for memory hotremove - Change some naming * Magic -> types * MIX_INFO -> MIX_SECTION_INFO * Change definition of bootmem type from direct hex value - __free_pages_bootmem() becomes __meminit. Signed-off-by: Yasunori Goto Cc: Andy Whitcroft Cc: Badari Pulavarty Cc: Yinghai Lu Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 12 ++++++------ mm/page_alloc.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ec85c37dcfb9..0fb05b258f0c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -62,9 +62,9 @@ static void release_memory_resource(struct resource *res) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #ifndef CONFIG_SPARSEMEM_VMEMMAP -static void get_page_bootmem(unsigned long info, struct page *page, int magic) +static void get_page_bootmem(unsigned long info, struct page *page, int type) { - atomic_set(&page->_mapcount, magic); + atomic_set(&page->_mapcount, type); SetPagePrivate(page); set_page_private(page, info); atomic_inc(&page->_count); @@ -72,10 +72,10 @@ static void get_page_bootmem(unsigned long info, struct page *page, int magic) void put_page_bootmem(struct page *page) { - int magic; + int type; - magic = atomic_read(&page->_mapcount); - BUG_ON(magic >= -1); + type = atomic_read(&page->_mapcount); + BUG_ON(type >= -1); if (atomic_dec_return(&page->_count) == 1) { ClearPagePrivate(page); @@ -119,7 +119,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; for (i = 0; i < mapsize; i++, page++) - get_page_bootmem(section_nr, page, MIX_INFO); + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cd4c41432ef6..6da667274df5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -533,7 +533,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) /* * permit the bootmem allocator to evade page validation on high-order frees */ -void __free_pages_bootmem(struct page *page, unsigned int order) +void __meminit __free_pages_bootmem(struct page *page, unsigned int order) { if (order == 0) { __ClearPageReserved(page); -- cgit v1.2.2 From 2f7f24eca31c4fc2fdb134b2ef743ccd67cfb9a9 Mon Sep 17 00:00:00 2001 From: Kent Liu Date: Wed, 23 Jul 2008 21:28:18 -0700 
Subject: memory-hotplug: don't calculate vm_total_pages twice when rebuilding zonelists in online_pages() If zonelist is required to be rebuilt in online_pages(), there is no need to recalculate vm_total_pages in that function, as it has been updated in the call build_all_zonelists(). Signed-off-by: Kent Liu Acked-by: KAMEZAWA Hiroyuki Cc: Yasunori Goto Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0fb05b258f0c..93aba78dc8b6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -429,7 +429,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) if (need_zonelists_rebuild) build_all_zonelists(); - vm_total_pages = nr_free_pagecache_pages(); + else + vm_total_pages = nr_free_pagecache_pages(); + writeback_set_ratelimit(); if (onlined_pages) -- cgit v1.2.2 From 5c755e9fd813810680abd56ec09a5f90143e815b Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Wed, 23 Jul 2008 21:28:19 -0700 Subject: memory-hotplug: add sysfs removable attribute for hotplug memory remove Memory may be hot-removed on a per-memory-block basis, particularly on POWER where the SPARSEMEM section size often matches the memory-block size. A user-level agent must be able to identify which sections of memory are likely to be removable before attempting the potentially expensive operation. This patch adds a file called "removable" to the memory directory in sysfs to help such an agent. In this patch, a memory block is considered removable if; o It contains only MOVABLE pageblocks o It contains only pageblocks with free pages regardless of pageblock type On the other hand, a memory block starting with a PageReserved() page will never be considered removable. Without this patch, the user-agent is forced to choose a memory block to remove randomly. Sample output of the sysfs files: ./memory/memory0/removable: 0 ./memory/memory1/removable: 0 ./memory/memory2/removable: 0 ./memory/memory3/removable: 0 ./memory/memory4/removable: 0 ./memory/memory5/removable: 0 ./memory/memory6/removable: 0 ./memory/memory7/removable: 1 ./memory/memory8/removable: 0 ./memory/memory9/removable: 0 ./memory/memory10/removable: 0 ./memory/memory11/removable: 0 ./memory/memory12/removable: 0 ./memory/memory13/removable: 0 ./memory/memory14/removable: 0 ./memory/memory15/removable: 0 ./memory/memory16/removable: 0 ./memory/memory17/removable: 1 ./memory/memory18/removable: 1 ./memory/memory19/removable: 1 ./memory/memory20/removable: 1 ./memory/memory21/removable: 1 ./memory/memory22/removable: 1 Signed-off-by: Badari Pulavarty Signed-off-by: Mel Gorman Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 93aba78dc8b6..89fee2dcb039 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -522,6 +522,66 @@ error: EXPORT_SYMBOL_GPL(add_memory); #ifdef CONFIG_MEMORY_HOTREMOVE +/* + * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy + * set and the size of the free page is given by page_order(). Using this, + * the function determines if the pageblock contains only free pages. 
+ * Due to buddy contraints, a free page at least the size of a pageblock will + * be located at the start of the pageblock + */ +static inline int pageblock_free(struct page *page) +{ + return PageBuddy(page) && page_order(page) >= pageblock_order; +} + +/* Return the start of the next active pageblock after a given page */ +static struct page *next_active_pageblock(struct page *page) +{ + int pageblocks_stride; + + /* Ensure the starting page is pageblock-aligned */ + BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); + + /* Move forward by at least 1 * pageblock_nr_pages */ + pageblocks_stride = 1; + + /* If the entire pageblock is free, move to the end of free page */ + if (pageblock_free(page)) + pageblocks_stride += page_order(page) - pageblock_order; + + return page + (pageblocks_stride * pageblock_nr_pages); +} + +/* Checks if this range of memory is likely to be hot-removable. */ +int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) +{ + int type; + struct page *page = pfn_to_page(start_pfn); + struct page *end_page = page + nr_pages; + + /* Check the starting page of each pageblock within the range */ + for (; page < end_page; page = next_active_pageblock(page)) { + type = get_pageblock_migratetype(page); + + /* + * A pageblock containing MOVABLE or free pages is considered + * removable + */ + if (type != MIGRATE_MOVABLE && !pageblock_free(page)) + return 0; + + /* + * A pageblock starting with a PageReserved page is not + * considered removable. + */ + if (PageReserved(page)) + return 0; + } + + /* All pageblocks in the memory block are likely to be hot-removable */ + return 1; +} + /* * Confirm all pages in a range [start, end) is belongs to the same zone. */ -- cgit v1.2.2 From 83d1674a946141c3c59d430e96c224f7937e6158 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Wed, 23 Jul 2008 21:28:22 -0700 Subject: mm: make CONFIG_MIGRATION available w/o CONFIG_NUMA We'd like to support CONFIG_MEMORY_HOTREMOVE on s390, which depends on CONFIG_MIGRATION. So far, CONFIG_MIGRATION is only available with NUMA support. This patch makes CONFIG_MIGRATION selectable for architectures that define ARCH_ENABLE_MEMORY_HOTREMOVE. When MIGRATION is enabled w/o NUMA, the kernel won't compile because migrate_vmas() does not know about vm_ops->migrate() and vma_migratable() does not know about policy_zone. To fix this, those two functions can be restricted to '#ifdef CONFIG_NUMA' because they are not being used w/o NUMA. vma_migratable() is moved over from migrate.h to mempolicy.h. [kosaki.motohiro@jp.fujitsu.com: build fix] Acked-by: Christoph Lameter Signed-off-by: Gerald Schaefer Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: KOSAKI Motorhiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- mm/migrate.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index c4de85285bb4..aa799007a11b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -174,7 +174,7 @@ config SPLIT_PTLOCK_CPUS config MIGRATION bool "Page migration" def_bool y - depends on NUMA + depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. 
This is useful for diff --git a/mm/migrate.c b/mm/migrate.c index e7d13a708da0..376cceba82f9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1071,7 +1071,6 @@ out2: mmput(mm); return err; } -#endif /* * Call migration functions in the vma_ops that may prepare @@ -1093,3 +1092,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, } return err; } +#endif -- cgit v1.2.2 From 78ecba081224a2db5876b6b81cfed0b78f58adc7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 23 Jul 2008 21:28:23 -0700 Subject: mm: fix ever-decreasing swap priority Vegard Nossum has noticed the ever-decreasing negative priority in a swapon /swapoff loop, which eventually would misprioritize when int wraps positive. Not worth spending much code on, but probably better fixed. It's easy to handle the swapping on and off of just one area, but there's not much point if a pair or more still misbehave. To handle the general case, swapoff should compact negative priorities, keeping them always from -1 to -MAX_SWAPFILES. That's a change, but should cause no regression, since these negative (unspecified) priorities are disjoint from the the positive specified priorities 0 to 32767. One small functional difference, which seems appropriate: when swapoff fails to free all swap from a negative priority area, that area is now reinserted at lowest priority, rather than at its original priority. In moving down swapon's setting of priority, I notice that an area is visible to /proc/swaps when it has swap_map set, yet that was being set before all the visible fields were properly filled in: corrected. Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Reported-by: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index bd1bb5920306..2f33edb8bee9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -37,6 +37,7 @@ DEFINE_SPINLOCK(swap_lock); unsigned int nr_swapfiles; long total_swap_pages; static int swap_overflow; +static int least_priority; static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; @@ -1260,6 +1261,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile) /* just pick something that's safe... 
*/ swap_list.next = swap_list.head; } + if (p->prio < 0) { + for (i = p->next; i >= 0; i = swap_info[i].next) + swap_info[i].prio = p->prio--; + least_priority++; + } nr_swap_pages -= p->pages; total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; @@ -1272,9 +1278,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile) if (err) { /* re-insert swap space back into swap_list */ spin_lock(&swap_lock); - for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) + if (p->prio < 0) + p->prio = --least_priority; + prev = -1; + for (i = swap_list.head; i >= 0; i = swap_info[i].next) { if (p->prio >= swap_info[i].prio) break; + prev = i; + } p->next = i; if (prev < 0) swap_list.head = swap_list.next = p - swap_info; @@ -1447,7 +1458,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) unsigned int type; int i, prev; int error; - static int least_priority; union swap_header *swap_header = NULL; int swap_header_version; unsigned int nr_good_pages = 0; @@ -1455,7 +1465,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) sector_t span; unsigned long maxpages = 1; int swapfilesize; - unsigned short *swap_map; + unsigned short *swap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; int did_down = 0; @@ -1474,22 +1484,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) } if (type >= nr_swapfiles) nr_swapfiles = type+1; + memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->extent_list); p->flags = SWP_USED; - p->swap_file = NULL; - p->old_block_size = 0; - p->swap_map = NULL; - p->lowest_bit = 0; - p->highest_bit = 0; - p->cluster_nr = 0; - p->inuse_pages = 0; p->next = -1; - if (swap_flags & SWAP_FLAG_PREFER) { - p->prio = - (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; - } else { - p->prio = --least_priority; - } spin_unlock(&swap_lock); name = getname(specialfile); error = PTR_ERR(name); @@ -1632,19 +1630,20 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; /* OK, set up the swap map and apply the bad block list */ - if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { + swap_map = vmalloc(maxpages * sizeof(short)); + if (!swap_map) { error = -ENOMEM; goto bad_swap; } error = 0; - memset(p->swap_map, 0, maxpages * sizeof(short)); + memset(swap_map, 0, maxpages * sizeof(short)); for (i = 0; i < swap_header->info.nr_badpages; i++) { int page_nr = swap_header->info.badpages[i]; if (page_nr <= 0 || page_nr >= swap_header->info.last_page) error = -EINVAL; else - p->swap_map[page_nr] = SWAP_MAP_BAD; + swap_map[page_nr] = SWAP_MAP_BAD; } nr_good_pages = swap_header->info.last_page - swap_header->info.nr_badpages - @@ -1654,7 +1653,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) } if (nr_good_pages) { - p->swap_map[0] = SWAP_MAP_BAD; + swap_map[0] = SWAP_MAP_BAD; p->max = maxpages; p->pages = nr_good_pages; nr_extents = setup_swap_extents(p, &span); @@ -1672,6 +1671,12 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) mutex_lock(&swapon_mutex); spin_lock(&swap_lock); + if (swap_flags & SWAP_FLAG_PREFER) + p->prio = + (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; + else + p->prio = --least_priority; + p->swap_map = swap_map; p->flags = SWP_ACTIVE; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; @@ -1707,12 +1712,8 @@ bad_swap: destroy_swap_extents(p); bad_swap_2: spin_lock(&swap_lock); - swap_map = p->swap_map; p->swap_file = NULL; - 
p->swap_map = NULL; p->flags = 0; - if (!(swap_flags & SWAP_FLAG_PREFER)) - ++least_priority; spin_unlock(&swap_lock); vfree(swap_map); if (swap_file) -- cgit v1.2.2 From 2b4bc46052ea8cd7c370b67ca0b9c26586f1439a Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 25 Jul 2008 01:45:42 -0700 Subject: pdflush: use time_after() instead of open-coding it Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pdflush.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/pdflush.c b/mm/pdflush.c index 9d834aa4b979..0cbe0c60c6bf 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -130,7 +130,7 @@ static int __pdflush(struct pdflush_work *my_work) * Thread creation: For how long have there been zero * available threads? */ - if (jiffies - last_empty_jifs > 1 * HZ) { + if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { /* unlocked list_empty() test is OK here */ if (list_empty(&pdflush_list)) { /* unlocked test is OK here */ @@ -151,7 +151,7 @@ static int __pdflush(struct pdflush_work *my_work) if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) continue; pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); - if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { + if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { /* Limit exit rate */ pdf->when_i_went_to_sleep = jiffies; break; /* exeunt */ -- cgit v1.2.2 From 3f31fddfa26b7594b44ff2b34f9a04ba409e0f91 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Fri, 25 Jul 2008 01:46:22 -0700 Subject: jbd: fix race between free buffer and commit transaction journal_try_to_free_buffers() can race with the jbd commit transaction when the latter holds the buffer reference while waiting for the data buffer to be flushed to disk. If the caller of journal_try_to_free_buffers() is trying hard to release the buffers, it will treat the failure as an error and return to its caller. We have seen direct I/O fail because of this race. Some callers of releasepage() also expect the buffers to be dropped when a GFP_KERNEL mask is passed down to releasepage()->journal_try_to_free_buffers(). With this patch, if the caller passes __GFP_WAIT and __GFP_FS to indicate that the call may block, and try_to_free_buffers() fails, we wait for journal_commit_transaction() to finish committing the current transaction and then try to free those buffers again. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mingming Cao Reviewed-by: Badari Pulavarty Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 7675b91f4f63..5d4c880d7cd9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2563,9 +2563,8 @@ EXPORT_SYMBOL(generic_file_aio_write); * Otherwise return zero. * * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). * - * NOTE: @gfp_mask may go away, and this function may become non-blocking.
*/ int try_to_release_page(struct page *page, gfp_t gfp_mask) { -- cgit v1.2.2 From 856c13aa1ff6136c1968414fdea5938ea9d5ebf2 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Fri, 25 Jul 2008 01:47:04 -0700 Subject: cgroup files: convert res_counter_write() to be a cgroups write_string() handler Currently res_counter_write() is a raw file handler even though it's ultimately taking a number, since in some cases it wants to pre-process the string when converting it to a number. This patch converts res_counter_write() from a raw file handler to a write_string() handler; this allows some of the boilerplate copying/locking/checking to be removed, and simplies the cleanup path, since these functions are now performed by the cgroups framework. [lizf@cn.fujitsu.com: build fix] Signed-off-by: Paul Menage Cc: Paul Jackson Cc: Pavel Emelyanov Cc: Balbir Singh Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e46451e1d9b7..7385d58fb061 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -838,32 +838,18 @@ out: return ret; } -static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) -{ - *tmp = memparse(buf, &buf); - if (*buf != '\0') - return -EINVAL; - - /* - * Round up the value to the closest page size - */ - *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT; - return 0; -} - static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, cft->private); } -static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, - struct file *file, const char __user *userbuf, - size_t nbytes, loff_t *ppos) +static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, + const char *buffer) { return res_counter_write(&mem_cgroup_from_cont(cont)->res, - cft->private, userbuf, nbytes, ppos, - mem_cgroup_write_strategy); + cft->private, buffer, + res_counter_memparse_write_strategy); } static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) @@ -940,7 +926,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "limit_in_bytes", .private = RES_LIMIT, - .write = mem_cgroup_write, + .write_string = mem_cgroup_write, .read_u64 = mem_cgroup_read, }, { -- cgit v1.2.2 From a181b0e888a1d917edcab57cd73ccf7d8e75a46c Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:08 -0700 Subject: memcg: make global var read_mostly mem_cgroup_subsys and page_cgroup_cache should be read_mostly and MEM_CGROUP_RECLAIM_RETRIES can be just a fixed number. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Acked-by: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7385d58fb061..c52c045f5152 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -35,9 +35,9 @@ #include -struct cgroup_subsys mem_cgroup_subsys; -static const int MEM_CGROUP_RECLAIM_RETRIES = 5; -static struct kmem_cache *page_cgroup_cache; +struct cgroup_subsys mem_cgroup_subsys __read_mostly; +static struct kmem_cache *page_cgroup_cache __read_mostly; +#define MEM_CGROUP_RECLAIM_RETRIES 5 /* * Statistics for memory cgroup. 
-- cgit v1.2.2 From 508b7be0a5b06b64203512ed9b34191cddc83f56 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:09 -0700 Subject: memcg: avoid unnecessary initialization * remove overkill initialization (in the fast path) * make the condition for PAGE_CGROUP_FLAG_ACTIVE more obvious. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Li Zefan Acked-by: Balbir Singh Acked-by: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c52c045f5152..90ccc1326356 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -296,7 +296,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); - list_del_init(&pc->lru); + list_del(&pc->lru); } static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, @@ -559,7 +559,7 @@ retry: } unlock_page_cgroup(page); - pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask); + pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); if (pc == NULL) goto err; @@ -606,9 +606,14 @@ retry: pc->ref_cnt = 1; pc->mem_cgroup = mem; pc->page = page; - pc->flags = PAGE_CGROUP_FLAG_ACTIVE; + /* + * If a page is accounted as a page cache, insert to inactive list. + * If anon, insert to active list. + */ if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) pc->flags = PAGE_CGROUP_FLAG_CACHE; + else + pc->flags = PAGE_CGROUP_FLAG_ACTIVE; lock_page_cgroup(page); if (page_get_page_cgroup(page)) { -- cgit v1.2.2 From e8589cc189f96b87348ae83ea4db38eaac624135 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:10 -0700 Subject: memcg: better migration handling This patch changes page migration under the memory controller to use a different algorithm. (Thanks to Christoph for the new idea.) Before: - page_cgroup is migrated from an old page to a new page. After: - a new page is accounted; there is no reuse of page_cgroup. Pros: - We can avoid complicated lock dependencies and races in migration. Cons: - new param to mem_cgroup_charge_common(). - mem_cgroup_getref() is added for handling ref_cnt ping-pong. This version simplifies the complicated lock dependencies in page migration under the memory resource controller. The new refcnt sequence is as follows, for a mapped page: prepare_migration() ..... +1 to NEW page try_to_unmap() ..... all refs to OLD page are gone. move_pages() ..... +1 to NEW page if page cache. remap... ..... all refs from *map* are added to NEW one. end_migration() ..... -1 to NEW page. The page's mapcount + (page_is_cache) refs are added to the NEW one.
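In condensed form, the ordering that unmap_and_move() follows after this patch looks like the sketch below (arguments and error paths simplified; every name is taken from the diff that follows):

	/* Charge the NEW page up front; 0 on success, -ENOMEM on failure. */
	charge = mem_cgroup_prepare_migration(page, newpage);
	/* All mapped references to the OLD page go away. */
	try_to_unmap(page, 1);
	/* The page-cache/radix-tree reference moves over to the NEW page. */
	rc = move_to_new_page(newpage, page);
	/* Drop the up-front charge again if it turned out to be redundant
	 * (migration failed or the new page was never remapped). */
	if (!charge)
		mem_cgroup_end_migration(newpage);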
Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Pavel Emelyanov Cc: Li Zefan Cc: YAMAMOTO Takashi Cc: Hugh Dickins Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 128 ++++++++++++++++++++++++++++---------------------------- mm/migrate.c | 22 ++++++---- 2 files changed, 80 insertions(+), 70 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 90ccc1326356..da5912b84551 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -524,7 +524,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, * < 0 if the cgroup is over its limit */ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype) + gfp_t gfp_mask, enum charge_type ctype, + struct mem_cgroup *memcg) { struct mem_cgroup *mem; struct page_cgroup *pc; @@ -569,16 +570,21 @@ retry: * thread group leader migrates. It's possible that mm is not * set, if so charge the init_mm (happens for pagecache usage). */ - if (!mm) - mm = &init_mm; + if (!memcg) { + if (!mm) + mm = &init_mm; - rcu_read_lock(); - mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); - /* - * For every charge from the cgroup, increment reference count - */ - css_get(&mem->css); - rcu_read_unlock(); + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + /* + * For every charge from the cgroup, increment reference count + */ + css_get(&mem->css); + rcu_read_unlock(); + } else { + mem = memcg; + css_get(&memcg->css); + } while (res_counter_charge(&mem->res, PAGE_SIZE)) { if (!(gfp_mask & __GFP_WAIT)) @@ -648,7 +654,7 @@ err: int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); } int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, @@ -657,7 +663,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (!mm) mm = &init_mm; return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_CACHE); + MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); +} + +int mem_cgroup_getref(struct page *page) +{ + struct page_cgroup *pc; + + if (mem_cgroup_subsys.disabled) + return 0; + + lock_page_cgroup(page); + pc = page_get_page_cgroup(page); + VM_BUG_ON(!pc); + pc->ref_cnt++; + unlock_page_cgroup(page); + return 0; } /* @@ -707,65 +728,39 @@ unlock: } /* - * Returns non-zero if a page (under migration) has valid page_cgroup member. - * Refcnt of page_cgroup is incremented. + * Before starting migration, account against new page. */ -int mem_cgroup_prepare_migration(struct page *page) +int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) { struct page_cgroup *pc; + struct mem_cgroup *mem = NULL; + enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; + int ret = 0; if (mem_cgroup_subsys.disabled) return 0; lock_page_cgroup(page); pc = page_get_page_cgroup(page); - if (pc) - pc->ref_cnt++; + if (pc) { + mem = pc->mem_cgroup; + css_get(&mem->css); + if (pc->flags & PAGE_CGROUP_FLAG_CACHE) + ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + } unlock_page_cgroup(page); - return pc != NULL; -} - -void mem_cgroup_end_migration(struct page *page) -{ - mem_cgroup_uncharge_page(page); + if (mem) { + ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, + ctype, mem); + css_put(&mem->css); + } + return ret; } -/* - * We know both *page* and *newpage* are now not-on-LRU and PG_locked. 
- * And no race with uncharge() routines because page_cgroup for *page* - * has extra one reference by mem_cgroup_prepare_migration. - */ -void mem_cgroup_page_migration(struct page *page, struct page *newpage) +/* remove redundant charge */ +void mem_cgroup_end_migration(struct page *newpage) { - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - unsigned long flags; - - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - if (!pc) { - unlock_page_cgroup(page); - return; - } - - mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_remove_list(mz, pc); - spin_unlock_irqrestore(&mz->lru_lock, flags); - - page_assign_page_cgroup(page, NULL); - unlock_page_cgroup(page); - - pc->page = newpage; - lock_page_cgroup(newpage); - page_assign_page_cgroup(newpage, pc); - - mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_add_list(mz, pc); - spin_unlock_irqrestore(&mz->lru_lock, flags); - - unlock_page_cgroup(newpage); + mem_cgroup_uncharge_page(newpage); } /* @@ -795,12 +790,19 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, page = pc->page; get_page(page); spin_unlock_irqrestore(&mz->lru_lock, flags); - mem_cgroup_uncharge_page(page); - put_page(page); - if (--count <= 0) { - count = FORCE_UNCHARGE_BATCH; + /* + * Check if this page is on LRU. !LRU page can be found + * if it's under page migration. + */ + if (PageLRU(page)) { + mem_cgroup_uncharge_page(page); + put_page(page); + if (--count <= 0) { + count = FORCE_UNCHARGE_BATCH; + cond_resched(); + } + } else cond_resched(); - } spin_lock_irqsave(&mz->lru_lock, flags); } spin_unlock_irqrestore(&mz->lru_lock, flags); diff --git a/mm/migrate.c b/mm/migrate.c index 376cceba82f9..f6d7f8efd1a8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -358,6 +358,10 @@ static int migrate_page_move_mapping(struct address_space *mapping, __inc_zone_page_state(newpage, NR_FILE_PAGES); write_unlock_irq(&mapping->tree_lock); + if (!PageSwapCache(newpage)) { + mem_cgroup_uncharge_page(page); + mem_cgroup_getref(newpage); + } return 0; } @@ -611,7 +615,6 @@ static int move_to_new_page(struct page *newpage, struct page *page) rc = fallback_migrate_page(mapping, newpage, page); if (!rc) { - mem_cgroup_page_migration(page, newpage); remove_migration_ptes(page, newpage); } else newpage->mapping = NULL; @@ -641,6 +644,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, /* page was freed from under us. So we are done. */ goto move_newpage; + charge = mem_cgroup_prepare_migration(page, newpage); + if (charge == -ENOMEM) { + rc = -ENOMEM; + goto move_newpage; + } + /* prepare cgroup just returns 0 or -ENOMEM */ + BUG_ON(charge); + rc = -EAGAIN; if (TestSetPageLocked(page)) { if (!force) @@ -692,19 +703,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, goto rcu_unlock; } - charge = mem_cgroup_prepare_migration(page); /* Establish migration ptes or remove ptes */ try_to_unmap(page, 1); if (!page_mapped(page)) rc = move_to_new_page(newpage, page); - if (rc) { + if (rc) remove_migration_ptes(page, page); - if (charge) - mem_cgroup_end_migration(page); - } else if (charge) - mem_cgroup_end_migration(newpage); rcu_unlock: if (rcu_locked) rcu_read_unlock(); @@ -725,6 +731,8 @@ unlock: } move_newpage: + if (!charge) + mem_cgroup_end_migration(newpage); /* * Move the new page to the LRU. If migration was not successful * then this will free the page. 
-- cgit v1.2.2 From 69029cd550284e32de13d6dd2f77b723c8a0e444 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:14 -0700 Subject: memcg: remove refcnt from page_cgroup memcg: performance improvements Patch Description 1/5 ... remove refcnt fron page_cgroup patch (shmem handling is fixed) 2/5 ... swapcache handling patch 3/5 ... add helper function for shmem's memory reclaim patch 4/5 ... optimize by likely/unlikely ppatch 5/5 ... remove redundunt check patch (shmem handling is fixed.) Unix bench result. == 2.6.26-rc2-mm1 + memory resource controller Execl Throughput 2915.4 lps (29.6 secs, 3 samples) C Compiler Throughput 1019.3 lpm (60.0 secs, 3 samples) Shell Scripts (1 concurrent) 5796.0 lpm (60.0 secs, 3 samples) Shell Scripts (8 concurrent) 1097.7 lpm (60.0 secs, 3 samples) Shell Scripts (16 concurrent) 565.3 lpm (60.0 secs, 3 samples) File Read 1024 bufsize 2000 maxblocks 1022128.0 KBps (30.0 secs, 3 samples) File Write 1024 bufsize 2000 maxblocks 544057.0 KBps (30.0 secs, 3 samples) File Copy 1024 bufsize 2000 maxblocks 346481.0 KBps (30.0 secs, 3 samples) File Read 256 bufsize 500 maxblocks 319325.0 KBps (30.0 secs, 3 samples) File Write 256 bufsize 500 maxblocks 148788.0 KBps (30.0 secs, 3 samples) File Copy 256 bufsize 500 maxblocks 99051.0 KBps (30.0 secs, 3 samples) File Read 4096 bufsize 8000 maxblocks 2058917.0 KBps (30.0 secs, 3 samples) File Write 4096 bufsize 8000 maxblocks 1606109.0 KBps (30.0 secs, 3 samples) File Copy 4096 bufsize 8000 maxblocks 854789.0 KBps (30.0 secs, 3 samples) Dc: sqrt(2) to 99 decimal places 126145.2 lpm (30.0 secs, 3 samples) INDEX VALUES TEST BASELINE RESULT INDEX Execl Throughput 43.0 2915.4 678.0 File Copy 1024 bufsize 2000 maxblocks 3960.0 346481.0 875.0 File Copy 256 bufsize 500 maxblocks 1655.0 99051.0 598.5 File Copy 4096 bufsize 8000 maxblocks 5800.0 854789.0 1473.8 Shell Scripts (8 concurrent) 6.0 1097.7 1829.5 ========= FINAL SCORE 991.3 == 2.6.26-rc2-mm1 + this set == Execl Throughput 3012.9 lps (29.9 secs, 3 samples) C Compiler Throughput 981.0 lpm (60.0 secs, 3 samples) Shell Scripts (1 concurrent) 5872.0 lpm (60.0 secs, 3 samples) Shell Scripts (8 concurrent) 1120.3 lpm (60.0 secs, 3 samples) Shell Scripts (16 concurrent) 578.0 lpm (60.0 secs, 3 samples) File Read 1024 bufsize 2000 maxblocks 1003993.0 KBps (30.0 secs, 3 samples) File Write 1024 bufsize 2000 maxblocks 550452.0 KBps (30.0 secs, 3 samples) File Copy 1024 bufsize 2000 maxblocks 347159.0 KBps (30.0 secs, 3 samples) File Read 256 bufsize 500 maxblocks 314644.0 KBps (30.0 secs, 3 samples) File Write 256 bufsize 500 maxblocks 151852.0 KBps (30.0 secs, 3 samples) File Copy 256 bufsize 500 maxblocks 101000.0 KBps (30.0 secs, 3 samples) File Read 4096 bufsize 8000 maxblocks 2033256.0 KBps (30.0 secs, 3 samples) File Write 4096 bufsize 8000 maxblocks 1611814.0 KBps (30.0 secs, 3 samples) File Copy 4096 bufsize 8000 maxblocks 847979.0 KBps (30.0 secs, 3 samples) Dc: sqrt(2) to 99 decimal places 128148.7 lpm (30.0 secs, 3 samples) INDEX VALUES TEST BASELINE RESULT INDEX Execl Throughput 43.0 3012.9 700.7 File Copy 1024 bufsize 2000 maxblocks 3960.0 347159.0 876.7 File Copy 256 bufsize 500 maxblocks 1655.0 101000.0 610.3 File Copy 4096 bufsize 8000 maxblocks 5800.0 847979.0 1462.0 Shell Scripts (8 concurrent) 6.0 1120.3 1867.2 ========= FINAL SCORE 1004.6 This patch: Remove refcnt from page_cgroup(). After this, * A page is charged only when !page_mapped() && no page_cgroup is assigned. * Anon page is newly mapped. * File page is added to mapping->tree. 
* A page is uncharged only when * Anon page is fully unmapped. * File page is removed from LRU. There is no change in behavior from user's view. This patch also removes unnecessary calls in rmap.c which was used only for refcnt mangement. [akpm@linux-foundation.org: fix warning] [hugh@veritas.com: fix shmem_unuse_inode charging] Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Li Zefan Cc: Hugh Dickins Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: David Rientjes Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 ++-- mm/memcontrol.c | 109 ++++++++++++++++++++++++++++++++------------------------ mm/migrate.c | 3 +- mm/rmap.c | 14 +------- mm/shmem.c | 35 ++++++++++++------ 5 files changed, 92 insertions(+), 75 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 5d4c880d7cd9..2d3ec1ffc66e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -115,7 +115,7 @@ void __remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - mem_cgroup_uncharge_page(page); + mem_cgroup_uncharge_cache_page(page); radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; @@ -474,12 +474,12 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); } else - mem_cgroup_uncharge_page(page); + mem_cgroup_uncharge_cache_page(page); write_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else - mem_cgroup_uncharge_page(page); + mem_cgroup_uncharge_cache_page(page); out: return error; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index da5912b84551..a61706193c31 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -166,7 +166,6 @@ struct page_cgroup { struct list_head lru; /* per cgroup LRU list */ struct page *page; struct mem_cgroup *mem_cgroup; - int ref_cnt; /* cached, mapped, migrating */ int flags; }; #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ @@ -185,6 +184,7 @@ static enum zone_type page_cgroup_zid(struct page_cgroup *pc) enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_MAPPED, + MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ }; /* @@ -552,9 +552,7 @@ retry: */ if (pc) { VM_BUG_ON(pc->page != page); - VM_BUG_ON(pc->ref_cnt <= 0); - - pc->ref_cnt++; + VM_BUG_ON(!pc->mem_cgroup); unlock_page_cgroup(page); goto done; } @@ -570,10 +568,7 @@ retry: * thread group leader migrates. It's possible that mm is not * set, if so charge the init_mm (happens for pagecache usage). */ - if (!memcg) { - if (!mm) - mm = &init_mm; - + if (likely(!memcg)) { rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); /* @@ -609,7 +604,6 @@ retry: } } - pc->ref_cnt = 1; pc->mem_cgroup = mem; pc->page = page; /* @@ -653,6 +647,17 @@ err: int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + /* + * If already mapped, we don't have to account. + * If page cache, page->mapping has address_space. + * But page->mapping may have out-of-use anon_vma pointer, + * detecit it by PageAnon() check. newly-mapped-anon's page->mapping + * is NULL. 
+ */ + if (page_mapped(page) || (page->mapping && !PageAnon(page))) + return 0; + if (unlikely(!mm)) + mm = &init_mm; return mem_cgroup_charge_common(page, mm, gfp_mask, MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); } @@ -660,32 +665,17 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - if (!mm) + if (unlikely(!mm)) mm = &init_mm; return mem_cgroup_charge_common(page, mm, gfp_mask, MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); } -int mem_cgroup_getref(struct page *page) -{ - struct page_cgroup *pc; - - if (mem_cgroup_subsys.disabled) - return 0; - - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - VM_BUG_ON(!pc); - pc->ref_cnt++; - unlock_page_cgroup(page); - return 0; -} - /* - * Uncharging is always a welcome operation, we never complain, simply - * uncharge. + * uncharge if !page_mapped(page) */ -void mem_cgroup_uncharge_page(struct page *page) +static void +__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { struct page_cgroup *pc; struct mem_cgroup *mem; @@ -704,29 +694,41 @@ void mem_cgroup_uncharge_page(struct page *page) goto unlock; VM_BUG_ON(pc->page != page); - VM_BUG_ON(pc->ref_cnt <= 0); - if (--(pc->ref_cnt) == 0) { - mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_remove_list(mz, pc); - spin_unlock_irqrestore(&mz->lru_lock, flags); + if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) + && ((pc->flags & PAGE_CGROUP_FLAG_CACHE) + || page_mapped(page))) + goto unlock; - page_assign_page_cgroup(page, NULL); - unlock_page_cgroup(page); + mz = page_cgroup_zoneinfo(pc); + spin_lock_irqsave(&mz->lru_lock, flags); + __mem_cgroup_remove_list(mz, pc); + spin_unlock_irqrestore(&mz->lru_lock, flags); - mem = pc->mem_cgroup; - res_counter_uncharge(&mem->res, PAGE_SIZE); - css_put(&mem->css); + page_assign_page_cgroup(page, NULL); + unlock_page_cgroup(page); - kmem_cache_free(page_cgroup_cache, pc); - return; - } + mem = pc->mem_cgroup; + res_counter_uncharge(&mem->res, PAGE_SIZE); + css_put(&mem->css); + kmem_cache_free(page_cgroup_cache, pc); + return; unlock: unlock_page_cgroup(page); } +void mem_cgroup_uncharge_page(struct page *page) +{ + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); +} + +void mem_cgroup_uncharge_cache_page(struct page *page) +{ + VM_BUG_ON(page_mapped(page)); + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); +} + /* * Before starting migration, account against new page. */ @@ -757,15 +759,29 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) return ret; } -/* remove redundant charge */ +/* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct page *newpage) { - mem_cgroup_uncharge_page(newpage); + /* + * At success, page->mapping is not NULL. + * special rollback care is necessary when + * 1. at migration failure. (newpage->mapping is cleared in this case) + * 2. the newpage was moved but not remapped again because the task + * exits and the newpage is obsolete. In this case, the new page + * may be a swapcache. So, we just call mem_cgroup_uncharge_page() + * always for avoiding mess. The page_cgroup will be removed if + * unnecessary. File cache pages is still on radix-tree. Don't + * care it. 
+ */ + if (!newpage->mapping) + __mem_cgroup_uncharge_common(newpage, + MEM_CGROUP_CHARGE_TYPE_FORCE); + else if (PageAnon(newpage)) + mem_cgroup_uncharge_page(newpage); } /* * This routine traverse page_cgroup in given list and drop them all. - * This routine ignores page_cgroup->ref_cnt. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. */ #define FORCE_UNCHARGE_BATCH (128) @@ -795,7 +811,8 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, * if it's under page migration. */ if (PageLRU(page)) { - mem_cgroup_uncharge_page(page); + __mem_cgroup_uncharge_common(page, + MEM_CGROUP_CHARGE_TYPE_FORCE); put_page(page); if (--count <= 0) { count = FORCE_UNCHARGE_BATCH; diff --git a/mm/migrate.c b/mm/migrate.c index f6d7f8efd1a8..d8c65a65c61d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -359,8 +359,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, write_unlock_irq(&mapping->tree_lock); if (!PageSwapCache(newpage)) { - mem_cgroup_uncharge_page(page); - mem_cgroup_getref(newpage); + mem_cgroup_uncharge_cache_page(page); } return 0; diff --git a/mm/rmap.c b/mm/rmap.c index bf0a5b7cfb8e..abbd29f7c43f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -576,14 +576,8 @@ void page_add_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); if (atomic_inc_and_test(&page->_mapcount)) __page_set_anon_rmap(page, vma, address); - else { + else __page_check_anon_rmap(page, vma, address); - /* - * We unconditionally charged during prepare, we uncharge here - * This takes care of balancing the reference counts - */ - mem_cgroup_uncharge_page(page); - } } /** @@ -614,12 +608,6 @@ void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) __inc_zone_page_state(page, NR_FILE_MAPPED); - else - /* - * We unconditionally charged during prepare, we uncharge here - * This takes care of balancing the reference counts - */ - mem_cgroup_uncharge_page(page); } #ifdef CONFIG_DEBUG_VM diff --git a/mm/shmem.c b/mm/shmem.c index 9ffbea9b79e1..d58305e8a484 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -922,20 +922,26 @@ found: error = 1; if (!inode) goto out; - /* Precharge page while we can wait, compensate afterwards */ + /* Precharge page using GFP_KERNEL while we can wait */ error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); if (error) goto out; error = radix_tree_preload(GFP_KERNEL); - if (error) - goto uncharge; + if (error) { + mem_cgroup_uncharge_cache_page(page); + goto out; + } error = 1; spin_lock(&info->lock); ptr = shmem_swp_entry(info, idx, NULL); - if (ptr && ptr->val == entry.val) + if (ptr && ptr->val == entry.val) { error = add_to_page_cache(page, inode->i_mapping, idx, GFP_NOWAIT); + /* does mem_cgroup_uncharge_cache_page on error */ + } else /* we must compensate for our precharge above */ + mem_cgroup_uncharge_cache_page(page); + if (error == -EEXIST) { struct page *filepage = find_get_page(inode->i_mapping, idx); error = 1; @@ -961,8 +967,6 @@ found: shmem_swp_unmap(ptr); spin_unlock(&info->lock); radix_tree_preload_end(); -uncharge: - mem_cgroup_uncharge_page(page); out: unlock_page(page); page_cache_release(page); @@ -1319,7 +1323,7 @@ repeat: page_cache_release(swappage); goto failed; } - mem_cgroup_uncharge_page(swappage); + mem_cgroup_uncharge_cache_page(swappage); } page_cache_release(swappage); goto repeat; @@ -1358,6 +1362,8 @@ repeat: } if (!filepage) { + int ret; + spin_unlock(&info->lock); filepage = shmem_alloc_page(gfp, info, idx); if (!filepage) { @@ -1386,10 
+1392,18 @@ repeat: swap = *entry; shmem_swp_unmap(entry); } - if (error || swap.val || 0 != add_to_page_cache_lru( - filepage, mapping, idx, GFP_NOWAIT)) { + ret = error || swap.val; + if (ret) + mem_cgroup_uncharge_cache_page(filepage); + else + ret = add_to_page_cache_lru(filepage, mapping, + idx, GFP_NOWAIT); + /* + * At add_to_page_cache_lru() failure, uncharge will + * be done automatically. + */ + if (ret) { spin_unlock(&info->lock); - mem_cgroup_uncharge_page(filepage); page_cache_release(filepage); shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); @@ -1398,7 +1412,6 @@ repeat: goto failed; goto repeat; } - mem_cgroup_uncharge_page(filepage); info->flags |= SHMEM_PAGEIN; } -- cgit v1.2.2 From c9b0ed51483cc2fc42bb801b6675c4231b0e4634 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:15 -0700 Subject: memcg: helper function for relcaim from shmem. A new call, mem_cgroup_shrink_usage() is added for shmem handling and relacing non-standard usage of mem_cgroup_charge/uncharge. Now, shmem calls mem_cgroup_charge() just for reclaim some pages from mem_cgroup. In general, shmem is used by some process group and not for global resource (like file caches). So, it's reasonable to reclaim pages from mem_cgroup where shmem is mainly used. [hugh@veritas.com: shmem_getpage release page sooner] [hugh@veritas.com: mem_cgroup_shrink_usage css_put] Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Li Zefan Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: David Rientjes Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 26 ++++++++++++++++++++++++++ mm/shmem.c | 11 ++++------- 2 files changed, 30 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a61706193c31..f46b8615de6c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -780,6 +780,32 @@ void mem_cgroup_end_migration(struct page *newpage) mem_cgroup_uncharge_page(newpage); } +/* + * A call to try to shrink memory usage under specified resource controller. + * This is typically used for page reclaiming for shmem for reducing side + * effect of page allocation from shmem, which is used by some mem_cgroup. + */ +int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) +{ + struct mem_cgroup *mem; + int progress = 0; + int retry = MEM_CGROUP_RECLAIM_RETRIES; + + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + css_get(&mem->css); + rcu_read_unlock(); + + do { + progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); + } while (!progress && --retry); + + css_put(&mem->css); + if (!retry) + return -ENOMEM; + return 0; +} + /* * This routine traverse page_cgroup in given list and drop them all. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 
diff --git a/mm/shmem.c b/mm/shmem.c index d58305e8a484..f92fea94d037 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1315,17 +1315,14 @@ repeat: shmem_swp_unmap(entry); spin_unlock(&info->lock); unlock_page(swappage); + page_cache_release(swappage); if (error == -ENOMEM) { /* allow reclaim from this memory cgroup */ - error = mem_cgroup_cache_charge(swappage, - current->mm, gfp & ~__GFP_HIGHMEM); - if (error) { - page_cache_release(swappage); + error = mem_cgroup_shrink_usage(current->mm, + gfp); + if (error) goto failed; - } - mem_cgroup_uncharge_cache_page(swappage); } - page_cache_release(swappage); goto repeat; } } else if (sgp == SGP_READ && !filepage) { -- cgit v1.2.2 From b76734e5e34e1889ab9fc5f3756570b1129f0f50 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:16 -0700 Subject: memcg: add hints for branch Show branch direction for obvious conditions. Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Li Zefan Cc: Hugh Dickins Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f46b8615de6c..04ded27f6226 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -550,7 +550,7 @@ retry: * The page_cgroup exists and * the page has already been accounted. */ - if (pc) { + if (unlikely(pc)) { VM_BUG_ON(pc->page != page); VM_BUG_ON(!pc->mem_cgroup); unlock_page_cgroup(page); @@ -559,7 +559,7 @@ retry: unlock_page_cgroup(page); pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); - if (pc == NULL) + if (unlikely(pc == NULL)) goto err; @@ -616,7 +616,7 @@ retry: pc->flags = PAGE_CGROUP_FLAG_ACTIVE; lock_page_cgroup(page); - if (page_get_page_cgroup(page)) { + if (unlikely(page_get_page_cgroup(page))) { unlock_page_cgroup(page); /* * Another charge has been added to this page already. @@ -690,7 +690,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) */ lock_page_cgroup(page); pc = page_get_page_cgroup(page); - if (!pc) + if (unlikely(!pc)) goto unlock; VM_BUG_ON(pc->page != page); -- cgit v1.2.2 From accf163e6ab729f1fc5fffaa0310e498270bf4e7 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:17 -0700 Subject: memcg: remove a redundant check Because of the remove-refcnt patch, it is now a very rare case for mem_cgroup_charge_common() to be called against a page that is already accounted. mem_cgroup_charge_common() is called when: 1. a page is added into the file cache. 2. an anon page is _newly_ mapped. One racy case is a newly-swapped-in anonymous page being referenced from several threads in do_swap_page() at the same time. (The page is not locked when mem_cgroup_charge() is called from do_swap_page.) Another case is shmem: it charges its page before calling add_to_page_cache(), so mem_cgroup_cache_charge() is called twice. This case is handled in mem_cgroup_cache_charge(), but the check may be too hacky... Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: "Eric W.
Biederman" Cc: Pavel Emelyanov Cc: Li Zefan Cc: Hugh Dickins Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 53 +++++++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 04ded27f6226..5b3759bd5494 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -536,28 +536,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, if (mem_cgroup_subsys.disabled) return 0; - /* - * Should page_cgroup's go to their own slab? - * One could optimize the performance of the charging routine - * by saving a bit in the page_flags and using it as a lock - * to see if the cgroup page already has a page_cgroup associated - * with it - */ -retry: - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - /* - * The page_cgroup exists and - * the page has already been accounted. - */ - if (unlikely(pc)) { - VM_BUG_ON(pc->page != page); - VM_BUG_ON(!pc->mem_cgroup); - unlock_page_cgroup(page); - goto done; - } - unlock_page_cgroup(page); - pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); if (unlikely(pc == NULL)) goto err; @@ -618,15 +596,10 @@ retry: lock_page_cgroup(page); if (unlikely(page_get_page_cgroup(page))) { unlock_page_cgroup(page); - /* - * Another charge has been added to this page already. - * We take lock_page_cgroup(page) again and read - * page->cgroup, increment refcnt.... just retry is OK. - */ res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); kmem_cache_free(page_cgroup_cache, pc); - goto retry; + goto done; } page_assign_page_cgroup(page, pc); @@ -665,8 +638,32 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + /* + * Corner case handling. This is called from add_to_page_cache() + * in usual. But some FS (shmem) precharges this page before calling it + * and call add_to_page_cache() with GFP_NOWAIT. + * + * For GFP_NOWAIT case, the page may be pre-charged before calling + * add_to_page_cache(). (See shmem.c) check it here and avoid to call + * charge twice. (It works but has to pay a bit larger cost.) + */ + if (!(gfp_mask & __GFP_WAIT)) { + struct page_cgroup *pc; + + lock_page_cgroup(page); + pc = page_get_page_cgroup(page); + if (pc) { + VM_BUG_ON(pc->page != page); + VM_BUG_ON(!pc->mem_cgroup); + unlock_page_cgroup(page); + return 0; + } + unlock_page_cgroup(page); + } + if (unlikely(!mm)) mm = &init_mm; + return mem_cgroup_charge_common(page, mm, gfp_mask, MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); } -- cgit v1.2.2 From cede86acd8bd5d2205dec28db8ac86410a3a19e8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 25 Jul 2008 01:47:18 -0700 Subject: memcg: clean up checking of the disabled flag Those checks are unnecessary, because when the subsystem is disabled it can't be mounted, so those functions won't get called. The check is needed in functions which will be called in other places except cgroup. 
[hugh@veritas.com: further checking of disabled flag] Signed-off-by: Li Zefan Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5b3759bd5494..0c035647d36a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -354,6 +354,9 @@ void mem_cgroup_move_lists(struct page *page, bool active) struct mem_cgroup_per_zone *mz; unsigned long flags; + if (mem_cgroup_subsys.disabled) + return; + /* * We cannot lock_page_cgroup while holding zone's lru_lock, * because other holders of lock_page_cgroup can be interrupted @@ -533,9 +536,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup_per_zone *mz; - if (mem_cgroup_subsys.disabled) - return 0; - pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); if (unlikely(pc == NULL)) goto err; @@ -620,6 +620,9 @@ err: int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + if (mem_cgroup_subsys.disabled) + return 0; + /* * If already mapped, we don't have to account. * If page cache, page->mapping has address_space. @@ -638,6 +641,9 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + if (mem_cgroup_subsys.disabled) + return 0; + /* * Corner case handling. This is called from add_to_page_cache() * in usual. But some FS (shmem) precharges this page before calling it @@ -788,6 +794,9 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) int progress = 0; int retry = MEM_CGROUP_RECLAIM_RETRIES; + if (mem_cgroup_subsys.disabled) + return 0; + rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); css_get(&mem->css); @@ -857,9 +866,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) int ret = -EBUSY; int node, zid; - if (mem_cgroup_subsys.disabled) - return 0; - css_get(&mem->css); /* * page reclaim code (kswapd etc..) will move pages between @@ -1103,8 +1109,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, static int mem_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) { - if (mem_cgroup_subsys.disabled) - return 0; return cgroup_add_files(cont, ss, mem_cgroup_files, ARRAY_SIZE(mem_cgroup_files)); } @@ -1117,9 +1121,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct mm_struct *mm; struct mem_cgroup *mem, *old_mem; - if (mem_cgroup_subsys.disabled) - return; - mm = get_task_mm(p); if (mm == NULL) return; -- cgit v1.2.2 From 628f42355389cfb596ca3a5a5f64fb9054a2a06a Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:20 -0700 Subject: memcg: limit change shrink usage Shrinking memory usage at limit change. 
[akpm@linux-foundation.org: coding-style fixes] Acked-by: Balbir Singh Acked-by: Pavel Emelyanov Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0c035647d36a..fba566c51322 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -812,6 +812,30 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) return 0; } +int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) +{ + + int retry_count = MEM_CGROUP_RECLAIM_RETRIES; + int progress; + int ret = 0; + + while (res_counter_set_limit(&memcg->res, val)) { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + if (!retry_count) { + ret = -EBUSY; + break; + } + progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL); + if (!progress) + retry_count--; + } + return ret; +} + + /* * This routine traverse page_cgroup in given list and drop them all. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. @@ -896,13 +920,29 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, cft->private); } - +/* + * The user of this function is... + * RES_LIMIT. + */ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, const char *buffer) { - return res_counter_write(&mem_cgroup_from_cont(cont)->res, - cft->private, buffer, - res_counter_memparse_write_strategy); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + unsigned long long val; + int ret; + + switch (cft->private) { + case RES_LIMIT: + /* This function does all necessary parse...reuse it */ + ret = res_counter_memparse_write_strategy(buffer, &val); + if (!ret) + ret = mem_cgroup_resize_limit(memcg, val); + break; + default: + ret = -EINVAL; /* should be BUG() ? */ + break; + } + return ret; } static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) -- cgit v1.2.2 From 873b47717732c2f33a4b14de02571a4295a02f0c Mon Sep 17 00:00:00 2001 From: Keika Kobayashi Date: Fri, 25 Jul 2008 01:48:52 -0700 Subject: per-task-delay-accounting: add memory reclaim delay Sometimes, application responses become bad under heavy memory load. Applications take a bit time to reclaim memory. The statistics, how long memory reclaim takes, will be useful to measure memory usage. This patch adds accounting memory reclaim to per-task-delay-accounting for accounting the time of do_try_to_free_pages(). - When System is under low memory load, memory reclaim may not occur. $ free total used free shared buffers cached Mem: 8197800 1577300 6620500 0 4808 1516724 -/+ buffers/cache: 55768 8142032 Swap: 16386292 0 16386292 $ vmstat 1 procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu---- r b swpd free buff cache si so bi bo in cs us sy id wa 0 0 0 5069748 10612 3014060 0 0 0 0 3 26 0 0 100 0 0 0 0 5069748 10612 3014060 0 0 0 0 4 22 0 0 100 0 0 0 0 5069748 10612 3014060 0 0 0 0 3 18 0 0 100 0 Measure the time of tar command. $ ls -s test.dat 1501472 test.dat $ time tar cvf test.tar test.dat real 0m13.388s user 0m0.116s sys 0m5.304s $ ./delayget -d -p CPU count real total virtual total delay total 428 5528345500 5477116080 62749891 IO count delay total 338 8078977189 SWAP count delay total 0 0 RECLAIM count delay total 0 0 - When system is under heavy memory load memory reclaim may occur. 
$ vmstat 1 procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu---- r b swpd free buff cache si so bi bo in cs us sy id wa 0 0 7159032 49724 1812 3012 0 0 0 0 3 24 0 0 100 0 0 0 7159032 49724 1812 3012 0 0 0 0 4 24 0 0 100 0 0 0 7159032 49848 1812 3012 0 0 0 0 3 22 0 0 100 0 In this case, one process uses more 8G memory by execution of malloc() and memset(). $ time tar cvf test.tar test.dat real 1m38.563s <- increased by 85 sec user 0m0.140s sys 0m7.060s $ ./delayget -d -p CPU count real total virtual total delay total 9021 7140446250 7315277975 923201824 IO count delay total 8965 90466349669 SWAP count delay total 3 21036367 RECLAIM count delay total 740 61011951153 In the later case, the value of RECLAIM is increasing. So, taskstats can show how much memory reclaim influences TAT. Signed-off-by: Keika Kobayashi Acked-by: Balbir Singh Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 967d30ccd92b..26672c6cd3ce 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -1316,6 +1317,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zone *zone; enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + delayacct_freepages_start(); + if (scan_global_lru(sc)) count_vm_event(ALLOCSTALL); /* @@ -1396,6 +1399,8 @@ out: } else mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); + delayacct_freepages_end(); + return ret; } -- cgit v1.2.2 From e44d1b2998d62a1f2f4d7eb17b56ba396535509f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jul 2008 12:57:41 +0200 Subject: mm/hugetlb.c: fix build failure with !CONFIG_SYSCTL on !CONFIG_SYSCTL on x86 with latest -git i get: mm/hugetlb.c: In function 'decrement_hugepage_resv_vma': mm/hugetlb.c:83: error: 'reserve' undeclared (first use in this function) mm/hugetlb.c:83: error: (Each undeclared identifier is reported only once mm/hugetlb.c:83: error: for each function it appears in.) 
Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 41341c414194..a8bf4ab01f86 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1026,6 +1026,17 @@ static void __init report_hugepages(void) } } +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + #ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count) @@ -1375,17 +1386,6 @@ static int __init hugetlb_default_setup(char *s) } __setup("default_hugepagesz=", hugetlb_default_setup); -static unsigned int cpuset_mems_nr(unsigned int *array) -{ - int node; - unsigned int nr = 0; - - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; - - return nr; -} - int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) -- cgit v1.2.2 From 16d69265b930f7e2fa9eea381715696f780718f4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 25 Jul 2008 19:44:36 -0700 Subject: uninline arch_pick_mmap_layout() Fix this, on avr32: include/linux/utsname.h:35, from init/main.c:20: include/linux/sched.h: In function 'arch_pick_mmap_layout': include/linux/sched.h:2149: error: implicit declaration of function 'PAGE_ALIGN' Reported-by: Adrian Bunk Cc: Haavard Skinnemoen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 8f18683825bc..0efd83097ecf 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -136,3 +137,12 @@ char *strndup_user(const char __user *s, long n) return p; } EXPORT_SYMBOL(strndup_user); + +#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT +void arch_pick_mmap_layout(struct mm_struct *mm) +{ + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; +} +#endif -- cgit v1.2.2 From 8a21346058ad946134b6ddfeb5de975c3cfcf5da Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Fri, 25 Jul 2008 19:44:37 -0700 Subject: hugetlb: fix CONFIG_SYSCTL=n build Fixes a build failure reported by Alan Cox: mm/hugetlb.c: In function `hugetlb_acct_memory': mm/hugetlb.c:1507: error: implicit declaration of function `cpuset_mems_nr' Also reverts Ingo's commit e44d1b2998d62a1f2f4d7eb17b56ba396535509f Author: Ingo Molnar Date: Fri Jul 25 12:57:41 2008 +0200 mm/hugetlb.c: fix build failure with !CONFIG_SYSCTL which fixed the build error but added some unused-static-function warnings. 
Signed-off-by: Nishanth Aravamudan Cc: Alan Cox Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8bf4ab01f86..3be79dc18c5c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1026,18 +1026,6 @@ static void __init report_hugepages(void) } } -static unsigned int cpuset_mems_nr(unsigned int *array) -{ - int node; - unsigned int nr = 0; - - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; - - return nr; -} - -#ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count) { @@ -1386,6 +1374,18 @@ static int __init hugetlb_default_setup(char *s) } __setup("default_hugepagesz=", hugetlb_default_setup); +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + +#ifdef CONFIG_SYSCTL int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) -- cgit v1.2.2 From 8174c430e445a93016ef18f717fe570214fa38bf Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:24 -0700 Subject: x86: lockless get_user_pages_fast() Implement get_user_pages_fast without locking in the fastpath on x86. Do an optimistic lockless pagetable walk, without taking mmap_sem or any page table locks or even mmap_sem. Page table existence is guaranteed by turning interrupts off (combined with the fact that we're always looking up the current mm, means we can do the lockless page table walk within the constraints of the TLB shootdown design). Basically we can do this lockless pagetable walk in a similar manner to the way the CPU's pagetable walker does not have to take any locks to find present ptes. This patch (combined with the subsequent ones to convert direct IO to use it) was found to give about 10% performance improvement on a 2 socket 8 core Intel Xeon system running an OLTP workload on DB2 v9.5 "To test the effects of the patch, an OLTP workload was run on an IBM x3850 M2 server with 2 processors (quad-core Intel Xeon processors at 2.93 GHz) using IBM DB2 v9.5 running Linux 2.6.24rc7 kernel. Comparing runs with and without the patch resulted in an overall performance benefit of ~9.8%. Correspondingly, oprofiles showed that samples from __up_read and __down_read routines that is seen during thread contention for system resources was reduced from 2.8% down to .05%. Monitoring the /proc/vmstat output from the patched run showed that the counter for fast_gup contained a very high number while the fast_gup_slow value was zero." (fast_gup is the old name for get_user_pages_fast, fast_gup_slow is a counter we had for the number of times the slowpath was invoked). The main reason for the improvement is that DB2 has multiple threads each issuing direct-IO. Direct-IO uses get_user_pages, and thus the threads contend the mmap_sem cacheline, and can also contend on page table locks. I would anticipate larger performance gains on larger systems, however I think DB2 uses an adaptive mix of threads and processes, so it could be that thread contention remains pretty constant as machine size increases. In which case, we stuck with "only" a 10% gain. 
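To make the new interface concrete, here is a minimal pin/unpin sketch of the call pattern such a direct-IO style user follows; the buffer names and the pin_user_buffer()/unpin_user_buffer() helpers are purely illustrative and not part of the patch.

/* Hypothetical helpers: pin a user buffer without taking mmap_sem, then release it. */
static int pin_user_buffer(unsigned long uaddr, int npages, struct page **pages)
{
	int pinned = get_user_pages_fast(uaddr, npages, 1 /* write */, pages);

	if (pinned < 0)
		return pinned;	/* failed before any page was pinned */
	/* pinned may be fewer than npages; the caller handles a short pin */
	return pinned;
}

static void unpin_user_buffer(struct page **pages, int pinned)
{
	int i;

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);	/* drop the references taken by get_user_pages_fast() */
}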
The downside of using get_user_pages_fast is that if there is not a pte with the correct permissions for the access, we end up falling back to get_user_pages and so the get_user_pages_fast is a bit of extra work. However this should not be the common case in most performance critical code. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: Kconfig fix] [akpm@linux-foundation.org: Makefile fix/cleanup] [akpm@linux-foundation.org: warning fix] Signed-off-by: Nick Piggin Cc: Dave Kleikamp Cc: Andy Whitcroft Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andi Kleen Cc: Dave Kleikamp Cc: Badari Pulavarty Cc: Zach Brown Cc: Jens Axboe Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index aa799007a11b..efee5d379df4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -77,6 +77,9 @@ config FLAT_NODE_MEM_MAP def_bool y depends on !SPARSEMEM +config HAVE_GET_USER_PAGES_FAST + bool + # # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's # to represent different areas of memory. This variable allows -- cgit v1.2.2 From 30002ed2e41830ec03ec3e577ad83ac6b188f96e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:28 -0700 Subject: mm: readahead scan lockless radix_tree_next_hole() is implemented as a series of radix_tree_lookup()s. So it can be called locklessly, under rcu_read_lock(). Signed-off-by: Nick Piggin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Hugh Dickins Cc: "Paul E. McKenney" Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index d8723a5f6496..77e8ddf945e9 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping, if (hit_readahead_marker) { pgoff_t start; - read_lock_irq(&mapping->tree_lock); - start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); - read_unlock_irq(&mapping->tree_lock); + rcu_read_lock(); + start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); + rcu_read_unlock(); if (!start || start - offset > max) return 0; -- cgit v1.2.2 From e286781d5f2e9c846e012a39653a166e9d31777d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:30 -0700 Subject: mm: speculative page references If we can be sure that elevating the page_count on a pagecache page will pin it, we can speculatively run this operation, and subsequently check to see if we hit the right page rather than relying on holding a lock or otherwise pinning a reference to the page. This can be done if get_page/put_page behaves consistently throughout the whole tree (ie. if we "get" the page after it has been used for something else, we must be able to free it with a put_page). Actually, there is a period where the count behaves differently: when the page is free or if it is a constituent page of a compound page. We need an atomic_inc_not_zero operation to ensure we don't try to grab the page in either case. This patch introduces the core locking protocol to the pagecache (ie. adds page_cache_get_speculative, and tweaks some update-side code to make it work). 
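Stripped to its essentials, the read side of that protocol is roughly the following sketch (conceptual only; the real page_cache_get_speculative() in include/linux/pagemap.h also covers the !SMP configuration and carries debugging checks):

/* Conceptual sketch only: take a reference iff the refcount is non-zero. */
static inline int speculative_get_sketch(struct page *page)
{
	if (!get_page_unless_zero(page))	/* atomic_inc_not_zero() on the page count */
		return 0;	/* page is being freed; the caller must retry its lookup */
	return 1;
}

After a successful speculative get, the caller still re-checks that the page is the one installed at the slot it looked up, and retries otherwise, which is what the find_get_page() rework later in this series does.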
Thanks to Hugh for pointing out an improvement to the algorithm setting page_count to zero when we have control of all references, in order to hold off speculative getters. [kamezawa.hiroyu@jp.fujitsu.com: fix migration_entry_wait()] [hugh@veritas.com: fix add_to_page_cache] [akpm@linux-foundation.org: repair a comment] Signed-off-by: Nick Piggin Cc: Jeff Garzik Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Hugh Dickins Cc: "Paul E. McKenney" Reviewed-by: Peter Zijlstra Signed-off-by: Daisuke Nishimura Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: KOSAKI Motohiro Signed-off-by: Hugh Dickins Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 32 ++++++++++++++----------- mm/migrate.c | 20 ++++++++++++++-- mm/shmem.c | 6 ++--- mm/swap_state.c | 17 +++++++++---- mm/vmscan.c | 74 +++++++++++++++++++++++++++++++++++++++++---------------- 5 files changed, 105 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 2d3ec1ffc66e..4e182a9a14c0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -442,39 +442,43 @@ int filemap_write_and_wait_range(struct address_space *mapping, } /** - * add_to_page_cache - add newly allocated pagecache pages + * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add * @mapping: the page's address_space * @offset: page index * @gfp_mask: page allocation mode * - * This function is used to add newly allocated pagecache pages; - * the page is new, so we can just run SetPageLocked() against it. - * The other page state flags were set by rmqueue(). - * + * This function is used to add a page to the pagecache. It must be locked. * This function does not add the page to the LRU. The caller must do that. */ -int add_to_page_cache(struct page *page, struct address_space *mapping, +int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int error = mem_cgroup_cache_charge(page, current->mm, + int error; + + VM_BUG_ON(!PageLocked(page)); + + error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & ~__GFP_HIGHMEM); if (error) goto out; error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { + page_cache_get(page); + page->mapping = mapping; + page->index = offset; + write_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); - if (!error) { - page_cache_get(page); - SetPageLocked(page); - page->mapping = mapping; - page->index = offset; + if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); - } else + } else { + page->mapping = NULL; mem_cgroup_uncharge_cache_page(page); + page_cache_release(page); + } write_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); @@ -483,7 +487,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, out: return error; } -EXPORT_SYMBOL(add_to_page_cache); +EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) diff --git a/mm/migrate.c b/mm/migrate.c index d8c65a65c61d..3ca6392e82cc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -285,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, page = migration_entry_to_page(entry); - get_page(page); + /* + * Once radix-tree replacement of page migration started, page_count + * *must* be zero. And, we don't want to call wait_on_page_locked() + * against a page without get_page(). 
+ * So, we use get_page_unless_zero(), here. Even failed, page fault + * will occur again. + */ + if (!get_page_unless_zero(page)) + goto out; pte_unmap_unlock(ptep, ptl); wait_on_page_locked(page); put_page(page); @@ -305,6 +313,7 @@ out: static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) { + int expected_count; void **pslot; if (!mapping) { @@ -319,12 +328,18 @@ static int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - if (page_count(page) != 2 + !!PagePrivate(page) || + expected_count = 2 + !!PagePrivate(page); + if (page_count(page) != expected_count || (struct page *)radix_tree_deref_slot(pslot) != page) { write_unlock_irq(&mapping->tree_lock); return -EAGAIN; } + if (!page_freeze_refs(page, expected_count)) { + write_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + /* * Now we know that no one else is looking at the page. */ @@ -338,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, radix_tree_replace_slot(pslot, newpage); + page_unfreeze_refs(page, expected_count); /* * Drop cache reference from old page. * We know this isn't the last reference. diff --git a/mm/shmem.c b/mm/shmem.c index f92fea94d037..1089092aecaf 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -936,7 +936,7 @@ found: spin_lock(&info->lock); ptr = shmem_swp_entry(info, idx, NULL); if (ptr && ptr->val == entry.val) { - error = add_to_page_cache(page, inode->i_mapping, + error = add_to_page_cache_locked(page, inode->i_mapping, idx, GFP_NOWAIT); /* does mem_cgroup_uncharge_cache_page on error */ } else /* we must compensate for our precharge above */ @@ -1301,8 +1301,8 @@ repeat: SetPageUptodate(filepage); set_page_dirty(filepage); swap_free(swap); - } else if (!(error = add_to_page_cache( - swappage, mapping, idx, GFP_NOWAIT))) { + } else if (!(error = add_to_page_cache_locked(swappage, mapping, + idx, GFP_NOWAIT))) { info->flags |= SHMEM_PAGEIN; shmem_swp_set(info, entry, 0); shmem_swp_unmap(entry); diff --git a/mm/swap_state.c b/mm/swap_state.c index d8aadaf2a0ba..3e3381d6c7ee 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -64,7 +64,7 @@ void show_swap_cache_info(void) } /* - * add_to_swap_cache resembles add_to_page_cache on swapper_space, + * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. 
*/ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) @@ -76,19 +76,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) BUG_ON(PagePrivate(page)); error = radix_tree_preload(gfp_mask); if (!error) { + page_cache_get(page); + SetPageSwapCache(page); + set_page_private(page, entry.val); + write_lock_irq(&swapper_space.tree_lock); error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); - if (!error) { - page_cache_get(page); - SetPageSwapCache(page); - set_page_private(page, entry.val); + if (likely(!error)) { total_swapcache_pages++; __inc_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(add_total); } write_unlock_irq(&swapper_space.tree_lock); radix_tree_preload_end(); + + if (unlikely(error)) { + set_page_private(page, 0UL); + ClearPageSwapCache(page); + page_cache_release(page); + } } return error; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 26672c6cd3ce..0075eac1cd04 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -391,12 +391,10 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } /* - * Attempt to detach a locked page from its ->mapping. If it is dirty or if - * someone else has a ref on the page, abort and return 0. If it was - * successfully detached, return 1. Assumes the caller has a single ref on - * this page. + * Same as remove_mapping, but if the page is removed from the mapping, it + * gets returned with a refcount of 0. */ -int remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct page *page) { BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); @@ -427,24 +425,24 @@ int remove_mapping(struct address_space *mapping, struct page *page) * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. */ - if (unlikely(page_count(page) != 2)) + if (!page_freeze_refs(page, 2)) goto cannot_free; - smp_rmb(); - if (unlikely(PageDirty(page))) + /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ + if (unlikely(PageDirty(page))) { + page_unfreeze_refs(page, 2); goto cannot_free; + } if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); write_unlock_irq(&mapping->tree_lock); swap_free(swap); - __put_page(page); /* The pagecache ref */ - return 1; + } else { + __remove_from_page_cache(page); + write_unlock_irq(&mapping->tree_lock); } - __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); - __put_page(page); return 1; cannot_free: @@ -452,6 +450,26 @@ cannot_free: return 0; } +/* + * Attempt to detach a locked page from its ->mapping. If it is dirty or if + * someone else has a ref on the page, abort and return 0. If it was + * successfully detached, return 1. Assumes the caller has a single ref on + * this page. + */ +int remove_mapping(struct address_space *mapping, struct page *page) +{ + if (__remove_mapping(mapping, page)) { + /* + * Unfreezing the refcount with 1 rather than 2 effectively + * drops the pagecache ref for us without requiring another + * atomic operation. 
+ */ + page_unfreeze_refs(page, 1); + return 1; + } + return 0; +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -598,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PagePrivate(page)) { if (!try_to_release_page(page, sc->gfp_mask)) goto activate_locked; - if (!mapping && page_count(page) == 1) - goto free_it; + if (!mapping && page_count(page) == 1) { + unlock_page(page); + if (put_page_testzero(page)) + goto free_it; + else { + /* + * rare race with speculative reference. + * the speculative reference will free + * this page shortly, so we may + * increment nr_reclaimed here (and + * leave it off the LRU). + */ + nr_reclaimed++; + continue; + } + } } - if (!mapping || !remove_mapping(mapping, page)) + if (!mapping || !__remove_mapping(mapping, page)) goto keep_locked; -free_it: unlock_page(page); +free_it: nr_reclaimed++; - if (!pagevec_add(&freed_pvec, page)) - __pagevec_release_nonlru(&freed_pvec); + if (!pagevec_add(&freed_pvec, page)) { + __pagevec_free(&freed_pvec); + pagevec_reinit(&freed_pvec); + } continue; activate_locked: @@ -623,7 +657,7 @@ keep: } list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) - __pagevec_release_nonlru(&freed_pvec); + __pagevec_free(&freed_pvec); count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } -- cgit v1.2.2 From a60637c85893e7191faaafa6a72e197c24386727 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:31 -0700 Subject: mm: lockless pagecache Combine page_cache_get_speculative with lockless radix tree lookups to introduce lockless page cache lookups (ie. no mapping->tree_lock on the read-side). The only atomicity changes this introduces is that the gang pagecache lookup functions now behave as if they are implemented with multiple find_get_page calls, rather than operating on a snapshot of the pages. In practice, this atomicity guarantee is not used anyway, and it is to replace individual lookups, so these semantics are natural. Signed-off-by: Nick Piggin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Hugh Dickins Cc: "Paul E. McKenney" Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 179 ++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 134 insertions(+), 45 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 4e182a9a14c0..feb8448d8618 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -637,15 +637,35 @@ void __lock_page_nosync(struct page *page) * Is there a pagecache struct page at the given (mapping, offset) tuple? * If yes, increment its refcount and return it; if no, return NULL. */ -struct page * find_get_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_page(struct address_space *mapping, pgoff_t offset) { + void **pagep; struct page *page; - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page) - page_cache_get(page); - read_unlock_irq(&mapping->tree_lock); + rcu_read_lock(); +repeat: + page = NULL; + pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + if (pagep) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page || page == RADIX_TREE_RETRY)) + goto repeat; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. 
+ */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + goto repeat; + } + } + rcu_read_unlock(); + return page; } EXPORT_SYMBOL(find_get_page); @@ -660,32 +680,22 @@ EXPORT_SYMBOL(find_get_page); * * Returns zero if the page was not present. find_lock_page() may sleep. */ -struct page *find_lock_page(struct address_space *mapping, - pgoff_t offset) +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); + page = find_get_page(mapping, offset); if (page) { - page_cache_get(page); - if (TestSetPageLocked(page)) { - read_unlock_irq(&mapping->tree_lock); - __lock_page(page); - - /* Has the page been truncated while we slept? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - page_cache_release(page); - goto repeat; - } - VM_BUG_ON(page->index != offset); - goto out; + lock_page(page); + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; } + VM_BUG_ON(page->index != offset); } - read_unlock_irq(&mapping->tree_lock); -out: return page; } EXPORT_SYMBOL(find_lock_page); @@ -751,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, start, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (!page_cache_get_speculative(page)) + goto repeat; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, start, nr_pages); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); - read_unlock_irq(&mapping->tree_lock); + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + } + rcu_read_unlock(); return ret; } @@ -778,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, index, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, index, nr_pages); - for (i = 0; i < ret; i++) { - if (pages[i]->mapping == NULL || pages[i]->index != index) + if (page->mapping == NULL || page->index != index) break; - page_cache_get(pages[i]); + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? 
*/ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; index++; } - read_unlock_irq(&mapping->tree_lock); - return i; + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(find_get_pages_contig); @@ -810,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, + (void ***)pages, *index, nr_pages, tag); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + } + rcu_read_unlock(); - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup_tag(&mapping->page_tree, - (void **)pages, *index, nr_pages, tag); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); if (ret) *index = pages[ret - 1]->index + 1; - read_unlock_irq(&mapping->tree_lock); + return ret; } EXPORT_SYMBOL(find_get_pages_tag); -- cgit v1.2.2 From 19fd6231279be3c3bdd02ed99f9b0eb195978064 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:32 -0700 Subject: mm: spinlock tree_lock mapping->tree_lock has no read lockers. convert the lock from an rwlock to a spinlock. Signed-off-by: Nick Piggin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Hugh Dickins Cc: "Paul E. McKenney" Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 10 +++++----- mm/migrate.c | 11 +++++------ mm/page-writeback.c | 12 ++++++------ mm/swap_state.c | 10 +++++----- mm/swapfile.c | 4 ++-- mm/truncate.c | 6 +++--- mm/vmscan.c | 8 ++++---- 7 files changed, 30 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index feb8448d8618..2ed8b0389c51 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -109,7 +109,7 @@ /* * Remove a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold a write_lock on the mapping's tree_lock. + * is safe. The caller must hold the mapping's tree_lock. 
*/ void __remove_from_page_cache(struct page *page) { @@ -141,9 +141,9 @@ void remove_from_page_cache(struct page *page) BUG_ON(!PageLocked(page)); - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); } static int sync_page(void *word) @@ -469,7 +469,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page->mapping = mapping; page->index = offset; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (likely(!error)) { mapping->nrpages++; @@ -480,7 +480,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page_cache_release(page); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else mem_cgroup_uncharge_cache_page(page); diff --git a/mm/migrate.c b/mm/migrate.c index 3ca6392e82cc..153572fb60b8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -323,7 +323,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, return 0; } - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); @@ -331,12 +331,12 @@ static int migrate_page_move_mapping(struct address_space *mapping, expected_count = 2 + !!PagePrivate(page); if (page_count(page) != expected_count || (struct page *)radix_tree_deref_slot(pslot) != page) { - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } if (!page_freeze_refs(page, expected_count)) { - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -373,10 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping, __dec_zone_page_state(page, NR_FILE_PAGES); __inc_zone_page_state(newpage, NR_FILE_PAGES); - write_unlock_irq(&mapping->tree_lock); - if (!PageSwapCache(newpage)) { + spin_unlock_irq(&mapping->tree_lock); + if (!PageSwapCache(newpage)) mem_cgroup_uncharge_cache_page(page); - } return 0; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 94c6d8988ab3..24de8b65fdbd 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1088,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page) if (!mapping) return 1; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? 
*/ BUG_ON(mapping2 != mapping); @@ -1102,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page) radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1258,7 +1258,7 @@ int test_clear_page_writeback(struct page *page) struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; - write_lock_irqsave(&mapping->tree_lock, flags); + spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestClearPageWriteback(page); if (ret) { radix_tree_tag_clear(&mapping->page_tree, @@ -1269,7 +1269,7 @@ int test_clear_page_writeback(struct page *page) __bdi_writeout_inc(bdi); } } - write_unlock_irqrestore(&mapping->tree_lock, flags); + spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestClearPageWriteback(page); } @@ -1287,7 +1287,7 @@ int test_set_page_writeback(struct page *page) struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; - write_lock_irqsave(&mapping->tree_lock, flags); + spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestSetPageWriteback(page); if (!ret) { radix_tree_tag_set(&mapping->page_tree, @@ -1300,7 +1300,7 @@ int test_set_page_writeback(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); + spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestSetPageWriteback(page); } diff --git a/mm/swap_state.c b/mm/swap_state.c index 3e3381d6c7ee..2c217e33d497 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = { struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), + .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), .a_ops = &swap_aops, .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info, @@ -80,7 +80,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) SetPageSwapCache(page); set_page_private(page, entry.val); - write_lock_irq(&swapper_space.tree_lock); + spin_lock_irq(&swapper_space.tree_lock); error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); if (likely(!error)) { @@ -88,7 +88,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) __inc_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(add_total); } - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); radix_tree_preload_end(); if (unlikely(error)) { @@ -182,9 +182,9 @@ void delete_from_swap_cache(struct page *page) entry.val = page_private(page); - write_lock_irq(&swapper_space.tree_lock); + spin_lock_irq(&swapper_space.tree_lock); __delete_from_swap_cache(page); - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); swap_free(entry); page_cache_release(page); diff --git a/mm/swapfile.c b/mm/swapfile.c index 2f33edb8bee9..af283933c14e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -369,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page) retval = 0; if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the swapcache lock held.. 
*/ - write_lock_irq(&swapper_space.tree_lock); + spin_lock_irq(&swapper_space.tree_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; } - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); } spin_unlock(&swap_lock); diff --git a/mm/truncate.c b/mm/truncate.c index b8961cb63414..e68443d74567 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -349,18 +349,18 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) goto failed; BUG_ON(PagePrivate(page)); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; failed: - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 0075eac1cd04..8f71761bc4b7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -399,7 +399,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); /* * The non racy check for a busy page. * @@ -436,17 +436,17 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); swap_free(swap); } else { __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); } return 1; cannot_free: - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return 0; } -- cgit v1.2.2 From 51cc50685a4275c6a02653670af9f108a64e01cf Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 25 Jul 2008 19:45:34 -0700 Subject: SL*B: drop kmem cache argument from constructor Kmem cache passed to constructor is only needed for constructors that are themselves multiplexeres. Nobody uses this "feature", nor does anybody uses passed kmem cache in non-trivial way, so pass only pointer to object. Non-trivial places are: arch/powerpc/mm/init_64.c arch/powerpc/mm/hugetlbpage.c This is flag day, yes. 
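For illustration, a hypothetical cache under the new constructor signature (not taken from the patch); kmem_cache_create() keeps its five arguments, only the ctor type changes:

/* Hypothetical example cache -- illustration only. */
struct foo {
	spinlock_t lock;
};

static struct kmem_cache *foo_cachep;

static void foo_ctor(void *obj)		/* no struct kmem_cache argument any more */
{
	struct foo *f = obj;

	spin_lock_init(&f->lock);
}

static int __init foo_cache_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				       SLAB_HWCACHE_ALIGN, foo_ctor);
	return foo_cachep ? 0 : -ENOMEM;
}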
Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Acked-by: Christoph Lameter Cc: Jon Tollefson Cc: Nick Piggin Cc: Matt Mackall [akpm@linux-foundation.org: fix arch/powerpc/mm/hugetlbpage.c] [akpm@linux-foundation.org: fix mm/slab.c] [akpm@linux-foundation.org: fix ubifs] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 +- mm/shmem.c | 2 +- mm/slab.c | 11 +++++------ mm/slob.c | 7 +++---- mm/slub.c | 13 ++++++------- 5 files changed, 16 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index abbd29f7c43f..39ae5a9bf382 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -138,7 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) anon_vma_free(anon_vma); } -static void anon_vma_ctor(struct kmem_cache *cachep, void *data) +static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; diff --git a/mm/shmem.c b/mm/shmem.c index 1089092aecaf..952d361774bb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2352,7 +2352,7 @@ static void shmem_destroy_inode(struct inode *inode) kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct shmem_inode_info *p = (struct shmem_inode_info *) foo; diff --git a/mm/slab.c b/mm/slab.c index 052e7d64537e..918f04f7fef1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -406,7 +406,7 @@ struct kmem_cache { unsigned int dflags; /* dynamic flags */ /* constructor func */ - void (*ctor)(struct kmem_cache *, void *); + void (*ctor)(void *obj); /* 5) cache creation/removal */ const char *name; @@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) */ struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + unsigned long flags, void (*ctor)(void *)) { size_t left_over, slab_size, ralign; struct kmem_cache *cachep = NULL, *pc; @@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep, * They must also be threaded. 
*/ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) - cachep->ctor(cachep, objp + obj_offset(cachep)); + cachep->ctor(objp + obj_offset(cachep)); if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) @@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep, cachep->buffer_size / PAGE_SIZE, 0); #else if (cachep->ctor) - cachep->ctor(cachep, objp); + cachep->ctor(objp); #endif slab_bufctl(slabp)[i] = i + 1; } @@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, #endif objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) - cachep->ctor(cachep, objp); + cachep->ctor(objp); #if ARCH_SLAB_MINALIGN if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", diff --git a/mm/slob.c b/mm/slob.c index de268eb7ac70..d8fbd4d1bfa7 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -525,12 +525,11 @@ struct kmem_cache { unsigned int size, align; unsigned long flags; const char *name; - void (*ctor)(struct kmem_cache *, void *); + void (*ctor)(void *); }; struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *c; @@ -575,7 +574,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) b = slob_new_page(flags, get_order(c->size), node); if (c->ctor) - c->ctor(c, b); + c->ctor(b); return b; } diff --git a/mm/slub.c b/mm/slub.c index 77c21cf53ff9..b7e2cd5d82db 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1012,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug); static unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { /* * Enable debugging if selected on the kernel commandline. 
@@ -1040,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, static inline void add_full(struct kmem_cache_node *n, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { return flags; } @@ -1103,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page, { setup_object_debug(s, page, object); if (unlikely(s->ctor)) - s->ctor(s, object); + s->ctor(object); } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -2286,7 +2286,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { memset(s, 0, kmem_size); s->name = name; @@ -3042,7 +3042,7 @@ static int slab_unmergeable(struct kmem_cache *s) static struct kmem_cache *find_mergeable(size_t size, size_t align, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { struct kmem_cache *s; @@ -3082,8 +3082,7 @@ static struct kmem_cache *find_mergeable(size_t size, } struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; -- cgit v1.2.2 From 4c8573e25f27b60b495aaa23089032f685ffd5ba Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jul 2008 19:45:37 -0700 Subject: Use WARN() in mm/vmalloc.c Use WARN() instead of a printk+WARN_ON() pair; this way the message becomes part of the warning section for better reporting/collection. Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 35f293816294..85b9a0d2c877 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -381,16 +381,14 @@ static void __vunmap(const void *addr, int deallocate_pages) return; if ((PAGE_SIZE-1) & (unsigned long)addr) { - printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); return; } area = remove_vm_area(addr); if (unlikely(!area)) { - printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", + WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); - WARN_ON(1); return; } -- cgit v1.2.2 From fa8e26ccd485216fc45c8c2dd1ec3b7ef1a0a2f8 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:50 -0700 Subject: tracehook: tracehook_expect_breakpoints This adds tracehook_expect_breakpoints() as a formal hook for the nommu code to use for its, "Is text-poking likely?" check at mmap time. This names the actual semantics the code means to test, and documents it. 
Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 4462b6a3fcb9..5edccd9c9218 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include @@ -745,7 +745,7 @@ static unsigned long determine_vm_flags(struct file *file, * it's being traced - otherwise breakpoints set in it may interfere * with another untraced process */ - if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) + if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) vm_flags &= ~VM_MAYSHARE; return vm_flags; -- cgit v1.2.2 From 2c97b7fc0d8c8661981beb9517da342ced3b3bc7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 25 Jul 2008 19:46:01 -0700 Subject: mm: print swapcache page count in show_swap_cache_info() Every arch implements its own show_mem() function. Most of them share quite some code, some of them are completely identical. This series implements a generic version of this function and migrates almost all architectures to it. This patch: Most show_mem() implementations calculate the amount of pages within the swapcache every time. Move the output to a more appropriate place and use the anyway available total_swapcache_pages variable. Signed-off-by: Johannes Weiner Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Haavard Skinnemoen Cc: Bryan Wu Cc: Chris Zankel Cc: Ingo Molnar Cc: Jeff Dike Cc: David S. Miller Cc: Paul Mundt Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: David Howells Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Yoshinori Sato Cc: Ralf Baechle Cc: Greg Ungerer Cc: Geert Uytterhoeven Cc: Roman Zippel Cc: Hirokazu Takata Cc: Mikael Starvik Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index 2c217e33d497..b8035b055129 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -56,7 +56,8 @@ static struct { void show_swap_cache_info(void) { - printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", + printk("%lu pages in swap cache\n", total_swapcache_pages); + printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total); printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); -- cgit v1.2.2 From 9e5c6da71e89fa25ced6e88182225a99941bec90 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 19:46:22 -0700 Subject: make mm/sparse.c: make a function static This patch makes the needlessly global sparse_early_mem_map_alloc() static. 
Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 8ffc08990008..5d9dbbb9d39e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -377,7 +377,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ -struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) +static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) { struct page *map; struct mem_section *ms = __nr_to_section(pnum); -- cgit v1.2.2 From 9d8fddfb17aaee4ffc5e3d0560620d0fa8b50a42 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 19:46:23 -0700 Subject: mm/allocpercpu.c: make 4 functions static This patch makes the following needlessly global functions static: - percpu_depopulate() - __percpu_depopulate_mask() - percpu_populate() - __percpu_populate_mask() Signed-off-by: Adrian Bunk Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/allocpercpu.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 843364594e23..4297bc41bfd2 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -18,27 +18,28 @@ * Depopulating per-cpu data for a cpu going offline would be a typical * use case. You need to register a cpu hotplug handler for that purpose. */ -void percpu_depopulate(void *__pdata, int cpu) +static void percpu_depopulate(void *__pdata, int cpu) { struct percpu_data *pdata = __percpu_disguise(__pdata); kfree(pdata->ptrs[cpu]); pdata->ptrs[cpu] = NULL; } -EXPORT_SYMBOL_GPL(percpu_depopulate); /** * percpu_depopulate_mask - depopulate per-cpu data for some cpu's * @__pdata: per-cpu data to depopulate * @mask: depopulate per-cpu data for cpu's selected through mask bits */ -void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) +static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) { int cpu; for_each_cpu_mask_nr(cpu, *mask) percpu_depopulate(__pdata, cpu); } -EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); + +#define percpu_depopulate_mask(__pdata, mask) \ + __percpu_depopulate_mask((__pdata), &(mask)) /** * percpu_populate - populate per-cpu data for given cpu @@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); * use case. You need to register a cpu hotplug handler for that purpose. * Per-cpu object is populated with zeroed buffer. */ -void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) +static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) { struct percpu_data *pdata = __percpu_disguise(__pdata); int node = cpu_to_node(cpu); @@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) pdata->ptrs[cpu] = kzalloc(size, gfp); return pdata->ptrs[cpu]; } -EXPORT_SYMBOL_GPL(percpu_populate); /** * percpu_populate_mask - populate per-cpu data for more cpu's @@ -79,8 +79,8 @@ EXPORT_SYMBOL_GPL(percpu_populate); * * Per-cpu objects are populated with zeroed buffers. 
*/ -int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, - cpumask_t *mask) +static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, + cpumask_t *mask) { cpumask_t populated; int cpu; @@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, cpu_set(cpu, populated); return 0; } -EXPORT_SYMBOL_GPL(__percpu_populate_mask); + +#define percpu_populate_mask(__pdata, size, gfp, mask) \ + __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) /** * percpu_alloc_mask - initial setup of per-cpu data -- cgit v1.2.2 From 15f59adae001766a2c7f7fe4f196387bb04bcff5 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 19:46:23 -0700 Subject: make mm/memory.c:print_bad_pte() static This patch makes the needlessly global print_bad_pte() static. Signed-off-by: Adrian Bunk Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 262e3eb6601a..a8ca04faaea6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -374,7 +374,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) * * The calling function must still handle the error. */ -void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) +static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, + unsigned long vaddr) { printk(KERN_ERR "Bad pte = %08llx, process = %s, " "vm_flags = %lx, vaddr = %lx\n", -- cgit v1.2.2 From 7c363b8c6536f26934172d3c46f0bbec01a97c61 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 19:46:24 -0700 Subject: mm/swapfile.c: make code static This patch makes the following needlessly global code static: - swap_lock - nr_swapfiles - struct swap_list Signed-off-by: Adrian Bunk Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index af283933c14e..6beb6251e99d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -33,8 +33,8 @@ #include #include -DEFINE_SPINLOCK(swap_lock); -unsigned int nr_swapfiles; +static DEFINE_SPINLOCK(swap_lock); +static unsigned int nr_swapfiles; long total_swap_pages; static int swap_overflow; static int least_priority; @@ -44,7 +44,7 @@ static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -struct swap_list_t swap_list = {-1, -1}; +static struct swap_list_t swap_list = {-1, -1}; static struct swap_info_struct swap_info[MAX_SWAPFILES]; -- cgit v1.2.2 From 93bc4e89c260d91576840c4881d1066d84ccd422 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sat, 26 Jul 2008 17:49:33 -0700 Subject: netfilter: fix double-free and use-after free As suggested by Patrick McHardy, introduce a __krealloc() that doesn't free the original buffer to fix a double-free and use-after-free bug introduced by me in netfilter that uses RCU. Reported-by: Patrick McHardy Signed-off-by: Pekka Enberg Tested-by: Dieter Ries Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- mm/util.c | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 8f18683825bc..6ef9e9943f62 100644 --- a/mm/util.c +++ b/mm/util.c @@ -68,25 +68,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) EXPORT_SYMBOL(kmemdup); /** - * krealloc - reallocate memory. The contents will remain unchanged. + * __krealloc - like krealloc() but don't free @p. * @p: object to reallocate memory for. * @new_size: how many bytes of memory are required. * @flags: the type of memory to allocate. * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. + * This function is like krealloc() except it never frees the originally + * allocated buffer. Use this if you don't want to free the buffer immediately + * like, for example, with RCU. */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) +void *__krealloc(const void *p, size_t new_size, gfp_t flags) { void *ret; size_t ks = 0; - if (unlikely(!new_size)) { - kfree(p); + if (unlikely(!new_size)) return ZERO_SIZE_PTR; - } if (p) ks = ksize(p); @@ -95,10 +92,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) return (void *)p; ret = kmalloc_track_caller(new_size, flags); - if (ret && p) { + if (ret && p) memcpy(ret, p, ks); + + return ret; +} +EXPORT_SYMBOL(__krealloc); + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!new_size)) { kfree(p); + return ZERO_SIZE_PTR; } + + ret = __krealloc(p, new_size, flags); + if (ret && p != ret) + kfree(p); + return ret; } EXPORT_SYMBOL(krealloc); -- cgit v1.2.2 From e6305c43eda10ebfd2ad9e35d6e172ccc7bb3695 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Jul 2008 21:03:57 -0400 Subject: [PATCH] sanitize ->permission() prototype * kill nameidata * argument; map the 3 bits in ->flags anybody cares about to new MAY_... ones and pass with the mask. * kill redundant gfs2_iop_permission() * sanitize ecryptfs_permission() * fix remaining places where ->permission() instances might barf on new MAY_... found in mask. 
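After this sanitization an individual ->permission() instance no longer sees a nameidata at all; the MAY_... bits arrive directly in the mask. A hedged sketch of the resulting shape (myfs_permission is hypothetical, mirroring the shmem change in the hunk below):

        /* Before: int myfs_permission(struct inode *inode, int mask, struct nameidata *nd); */
        static int myfs_permission(struct inode *inode, int mask)
        {
                return generic_permission(inode, mask, NULL);
        }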
The obvious next target in that direction is permission(9) folded fix for nfs_permission() breakage from Miklos Szeredi Signed-off-by: Al Viro --- mm/shmem_acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index f5664c5b9eb1..8e5aadd7dcd6 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c @@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask) * shmem_permission - permission() inode operation */ int -shmem_permission(struct inode *inode, int mask, struct nameidata *nd) +shmem_permission(struct inode *inode, int mask) { return generic_permission(inode, mask, shmem_check_acl); } -- cgit v1.2.2 From 2f1936b87783a3a56c9441b27b9ba7a747f11e8e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 24 Jun 2008 16:50:14 +0200 Subject: [patch 3/5] vfs: change remove_suid() to file_remove_suid() All calls to remove_suid() are made with a file pointer, because (similarly to file_update_time) it is called when the file is written. Clean up callers by passing in a file instead of a dentry. Signed-off-by: Miklos Szeredi --- mm/filemap.c | 7 ++++--- mm/filemap_xip.c | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 2ed8b0389c51..5de7633e1dbe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1758,8 +1758,9 @@ static int __remove_suid(struct dentry *dentry, int kill) return notify_change(dentry, &newattrs); } -int remove_suid(struct dentry *dentry) +int file_remove_suid(struct file *file) { + struct dentry *dentry = file->f_path.dentry; int killsuid = should_remove_suid(dentry); int killpriv = security_inode_need_killpriv(dentry); int error = 0; @@ -1773,7 +1774,7 @@ int remove_suid(struct dentry *dentry) return error; } -EXPORT_SYMBOL(remove_suid); +EXPORT_SYMBOL(file_remove_suid); static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) @@ -2529,7 +2530,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; - err = remove_suid(file->f_path.dentry); + err = file_remove_suid(file); if (err) goto out; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 3e744abcce9d..98a3f31ccd6a 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -380,7 +380,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, if (count == 0) goto out_backing; - ret = remove_suid(filp->f_path.dentry); + ret = file_remove_suid(filp); if (ret) goto out_backing; -- cgit v1.2.2 From 3b8f14b41026fb7d7e9a4af2a4128a702d07ad26 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 26 Jul 2008 15:22:28 -0700 Subject: mm/util.c must #include mm/util.c: In function 'arch_pick_mmap_layout': mm/util.c:144: error: dereferencing pointer to incomplete type mm/util.c:145: error: 'arch_get_unmapped_area' undeclared (first use in this function) mm/util.c:145: error: (Each undeclared identifier is reported only once mm/util.c:145: error: for each function it appears in.) 
mm/util.c:146: error: 'arch_unmap_area' undeclared (first use in this function) Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 0efd83097ecf..d8d02210e06c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -3,6 +3,7 @@ #include #include #include +#include #include /** -- cgit v1.2.2 From 9b1a4d38373a5581a4e01032a3ccdd94cd93477b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:30 -0500 Subject: stop_machine: Wean existing callers off stop_machine_run() Signed-off-by: Rusty Russell --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6da667274df5..3cf3d05b6bd4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2372,7 +2372,7 @@ static void build_zonelist_cache(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ -/* return values int ....just for stop_machine_run() */ +/* return values int ....just for stop_machine() */ static int __build_all_zonelists(void *dummy) { int nid; @@ -2397,7 +2397,7 @@ void build_all_zonelists(void) } else { /* we have to stop all cpus to guarantee there is no user of zonelist */ - stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); + stop_machine(__build_all_zonelists, NULL, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); -- cgit v1.2.2 From 14fcc23fdc78e9d32372553ccf21758a9bd56fa1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 28 Jul 2008 15:46:19 -0700 Subject: tmpfs: fix kernel BUG in shmem_delete_inode SuSE's insserve initscript ordering program hits kernel BUG at mm/shmem.c:814 on 2.6.26. It's using posix_fadvise on directories, and the shmem_readpage method added in 2.6.23 is letting POSIX_FADV_WILLNEED allocate useless pages to a tmpfs directory, incrementing i_blocks count but never decrementing it. Fix this by assigning shmem_aops (pointing to readpage and writepage and set_page_dirty) only when it's needed, on a regular file or a long symlink. Many thanks to Kel for outstanding bugreport and steps to reproduce it. 
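For context, the trigger boils down to fadvising a directory on tmpfs; a hedged userspace sketch of the reproducer idea (not the reporter's actual initscript tool, and /dev/shm is only an assumption of where a tmpfs is mounted):

        #define _XOPEN_SOURCE 600
        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/dev/shm", O_RDONLY);    /* any tmpfs directory */

                if (fd < 0) {
                        perror("open");
                        return 1;
                }
                /* Before the fix: shmem_readpage on a directory allocates
                 * useless pages that bump i_blocks and are never freed,
                 * tripping the BUG at inode deletion time. */
                posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
                close(fd);
                return 0;
        }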
Reported-by: Kel Modderman Tested-by: Kel Modderman Signed-off-by: Hugh Dickins Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 952d361774bb..c1e5a3b4f758 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1513,7 +1513,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; inode->i_blocks = 0; - inode->i_mapping->a_ops = &shmem_aops; inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_generation = get_seconds(); @@ -1528,6 +1527,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) init_special_inode(inode, mode, dev); break; case S_IFREG: + inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; mpol_shared_policy_init(&info->policy, @@ -1929,6 +1929,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return error; } unlock_page(page); + inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, len); -- cgit v1.2.2 From 7906d00cd1f687268f0a3599442d113767795ae6 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 28 Jul 2008 15:46:26 -0700 Subject: mmu-notifiers: add mm_take_all_locks() operation mm_take_all_locks holds off reclaim from an entire mm_struct. This allows mmu notifiers to register into the mm at any time with the guarantee that no mmu operation is in progress on the mm. This operation locks against the VM for all pte/vma/mm related operations that could ever happen on a certain mm. This includes vmtruncate, try_to_unmap, and all page faults. The caller must take the mmap_sem in write mode before calling mm_take_all_locks(). The caller isn't allowed to release the mmap_sem until mm_drop_all_locks() returns. mmap_sem in write mode is required in order to block all operations that could modify pagetables and free pages without need of altering the vma layout (for example populate_range() with nonlinear vmas). It's also needed in write mode to avoid new anon_vmas to be associated with existing vmas. A single task can't take more than one mm_take_all_locks() in a row or it would deadlock. mm_take_all_locks() and mm_drop_all_locks are expensive operations that may have to take thousand of locks. mm_take_all_locks() can fail if it's interrupted by signals. When mmu_notifier_register returns, we must be sure that the driver is notified if some task is in the middle of a vmtruncate for the 'mm' where the mmu notifier was registered (mmu_notifier_invalidate_range_start/end is run around the vmtruncation but mmu_notifier_register can run after mmu_notifier_invalidate_range_start and before mmu_notifier_invalidate_range_end). Same problem for rmap paths. And we've to remove page pinning to avoid replicating the tlb_gather logic inside KVM (and GRU doesn't work well with page pinning regardless of needing tlb_gather), so without mm_take_all_locks when vmtruncate frees the page, kvm would have no way to notice that it mapped into sptes a page that is going into the freelist without a chance of any further mmu_notifier notification. 
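The intended calling pattern, pieced together from the rules spelled out above (example_register is a made-up name; the real user arrives with the mmu-notifier registration code later in this series):

        /* Sketch: how a registration-style caller is expected to use the new API. */
        static int example_register(struct mm_struct *mm)
        {
                int err;

                down_write(&mm->mmap_sem);      /* caller must hold mmap_sem for write */
                err = mm_take_all_locks(mm);    /* may fail with -EINTR on a pending signal */
                if (!err) {
                        /* no vmtruncate, try_to_unmap or page fault can run on mm here */
                        mm_drop_all_locks(mm);
                }
                up_write(&mm->mmap_sem);        /* only after mm_drop_all_locks() */
                return err;
        }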
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrea Arcangeli Acked-by: Linus Torvalds Cc: Christoph Lameter Cc: Jack Steiner Cc: Robin Holt Cc: Nick Piggin Cc: Peter Zijlstra Cc: Kanoj Sarcar Cc: Roland Dreier Cc: Steve Wise Cc: Avi Kivity Cc: Hugh Dickins Cc: Rusty Russell Cc: Anthony Liguori Cc: Chris Wright Cc: Marcelo Tosatti Cc: Eric Dumazet Cc: "Paul E. McKenney" Cc: Izik Eidus Cc: Anthony Liguori Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 158 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 5e0cc99e9cd5..e5f9cb83d6d4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2268,3 +2268,161 @@ int install_special_mapping(struct mm_struct *mm, return 0; } + +static DEFINE_MUTEX(mm_all_locks_mutex); + +static void vm_lock_anon_vma(struct anon_vma *anon_vma) +{ + if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { + /* + * The LSB of head.next can't change from under us + * because we hold the mm_all_locks_mutex. + */ + spin_lock(&anon_vma->lock); + /* + * We can safely modify head.next after taking the + * anon_vma->lock. If some other vma in this mm shares + * the same anon_vma we won't take it again. + * + * No need of atomic instructions here, head.next + * can't change from under us thanks to the + * anon_vma->lock. + */ + if (__test_and_set_bit(0, (unsigned long *) + &anon_vma->head.next)) + BUG(); + } +} + +static void vm_lock_mapping(struct address_space *mapping) +{ + if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change from under us because + * we hold the mm_all_locks_mutex. + * + * Operations on ->flags have to be atomic because + * even if AS_MM_ALL_LOCKS is stable thanks to the + * mm_all_locks_mutex, there may be other cpus + * changing other bitflags in parallel to us. + */ + if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) + BUG(); + spin_lock(&mapping->i_mmap_lock); + } +} + +/* + * This operation locks against the VM for all pte/vma/mm related + * operations that could ever happen on a certain mm. This includes + * vmtruncate, try_to_unmap, and all page faults. + * + * The caller must take the mmap_sem in write mode before calling + * mm_take_all_locks(). The caller isn't allowed to release the + * mmap_sem until mm_drop_all_locks() returns. + * + * mmap_sem in write mode is required in order to block all operations + * that could modify pagetables and free pages without need of + * altering the vma layout (for example populate_range() with + * nonlinear vmas). It's also needed in write mode to avoid new + * anon_vmas to be associated with existing vmas. + * + * A single task can't take more than one mm_take_all_locks() in a row + * or it would deadlock. + * + * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in + * mapping->flags avoid to take the same lock twice, if more than one + * vma in this mm is backed by the same anon_vma or address_space. + * + * We can take all the locks in random order because the VM code + * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never + * takes more than one of them in a row. Secondly we're protected + * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. + * + * mm_take_all_locks() and mm_drop_all_locks are expensive operations + * that may have to take thousand of locks. + * + * mm_take_all_locks() can fail if it's interrupted by signals. 
+ */ +int mm_take_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + int ret = -EINTR; + + BUG_ON(down_read_trylock(&mm->mmap_sem)); + + mutex_lock(&mm_all_locks_mutex); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (signal_pending(current)) + goto out_unlock; + if (vma->anon_vma) + vm_lock_anon_vma(vma->anon_vma); + if (vma->vm_file && vma->vm_file->f_mapping) + vm_lock_mapping(vma->vm_file->f_mapping); + } + ret = 0; + +out_unlock: + if (ret) + mm_drop_all_locks(mm); + + return ret; +} + +static void vm_unlock_anon_vma(struct anon_vma *anon_vma) +{ + if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { + /* + * The LSB of head.next can't change to 0 from under + * us because we hold the mm_all_locks_mutex. + * + * We must however clear the bitflag before unlocking + * the vma so the users using the anon_vma->head will + * never see our bitflag. + * + * No need of atomic instructions here, head.next + * can't change from under us until we release the + * anon_vma->lock. + */ + if (!__test_and_clear_bit(0, (unsigned long *) + &anon_vma->head.next)) + BUG(); + spin_unlock(&anon_vma->lock); + } +} + +static void vm_unlock_mapping(struct address_space *mapping) +{ + if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change to 0 from under us + * because we hold the mm_all_locks_mutex. + */ + spin_unlock(&mapping->i_mmap_lock); + if (!test_and_clear_bit(AS_MM_ALL_LOCKS, + &mapping->flags)) + BUG(); + } +} + +/* + * The mmap_sem cannot be released by the caller until + * mm_drop_all_locks() returns. + */ +void mm_drop_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + BUG_ON(down_read_trylock(&mm->mmap_sem)); + BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->anon_vma) + vm_unlock_anon_vma(vma->anon_vma); + if (vma->vm_file && vma->vm_file->f_mapping) + vm_unlock_mapping(vma->vm_file->f_mapping); + } + + mutex_unlock(&mm_all_locks_mutex); +} -- cgit v1.2.2 From cddb8a5c14aa89810b40495d94d3d2a0faee6619 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 28 Jul 2008 15:46:29 -0700 Subject: mmu-notifiers: core With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages. There are secondary MMUs (with secondary sptes and secondary tlbs) too. sptes in the kvm case are shadow pagetables, but when I say spte in mmu-notifier context, I mean "secondary pte". In GRU case there's no actual secondary pte and there's only a secondary tlb because the GRU secondary MMU has no knowledge about sptes and every secondary tlb miss event in the MMU always generates a page fault that has to be resolved by the CPU (this is not the case of KVM where the a secondary tlb miss will walk sptes in hardware and it will refill the secondary tlb transparently to software if the corresponding spte is present). The same way zap_page_range has to invalidate the pte before freeing the page, the spte (and secondary tlb) must also be invalidated before any page is freed and reused. Currently we take a page_count pin on every page mapped by sptes, but that means the pages can't be swapped whenever they're mapped by any spte because they're part of the guest working set. 
Furthermore a spte unmap event can immediately lead to a page to be freed when the pin is released (so requiring the same complex and relatively slow tlb_gather smp safe logic we have in zap_page_range and that can be avoided completely if the spte unmap event doesn't require an unpin of the page previously mapped in the secondary MMU). The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know when the VM is swapping or freeing or doing anything on the primary MMU so that the secondary MMU code can drop sptes before the pages are freed, avoiding all page pinning and allowing 100% reliable swapping of guest physical address space. Furthermore it avoids the code that teardown the mappings of the secondary MMU, to implement a logic like tlb_gather in zap_page_range that would require many IPI to flush other cpu tlbs, for each fixed number of spte unmapped. To make an example: if what happens on the primary MMU is a protection downgrade (from writeable to wrprotect) the secondary MMU mappings will be invalidated, and the next secondary-mmu-page-fault will call get_user_pages and trigger a do_wp_page through get_user_pages if it called get_user_pages with write=1, and it'll re-establishing an updated spte or secondary-tlb-mapping on the copied page. Or it will setup a readonly spte or readonly tlb mapping if it's a guest-read, if it calls get_user_pages with write=0. This is just an example. This allows to map any page pointed by any pte (and in turn visible in the primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an full MMU with both sptes and secondary-tlb like the shadow-pagetable layer with kvm), or a remote DMA in software like XPMEM (hence needing of schedule in XPMEM code to send the invalidate to the remote node, while no need to schedule in kvm/gru as it's an immediate event like invalidating primary-mmu pte). At least for KVM without this patch it's impossible to swap guests reliably. And having this feature and removing the page pin allows several other optimizations that simplify life considerably. Dependencies: 1) mm_take_all_locks() to register the mmu notifier when the whole VM isn't doing anything with "mm". This allows mmu notifier users to keep track if the VM is in the middle of the invalidate_range_begin/end critical section with an atomic counter incraese in range_begin and decreased in range_end. No secondary MMU page fault is allowed to map any spte or secondary tlb reference, while the VM is in the middle of range_begin/end as any page returned by get_user_pages in that critical section could later immediately be freed without any further ->invalidate_page notification (invalidate_range_begin/end works on ranges and ->invalidate_page isn't called immediately before freeing the page). To stop all page freeing and pagetable overwrites the mmap_sem must be taken in write mode and all other anon_vma/i_mmap locks must be taken too. 2) It'd be a waste to add branches in the VM if nobody could possibly run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of mmu notifiers, but this already allows to compile a KVM external module against a kernel with mmu notifiers enabled and from the next pull from kvm.git we'll start using them. And GRU/XPMEM will also be able to continue the development by enabling KVM=m in their config, until they submit all GRU/XPMEM GPLv2 code to the mainline kernel. 
Then they can also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n). This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM are all =n. The mmu_notifier_register call can fail because mm_take_all_locks may be interrupted by a signal and return -EINTR. Because mmu_notifier_reigster is used when a driver startup, a failure can be gracefully handled. Here an example of the change applied to kvm to register the mmu notifiers. Usually when a driver startups other allocations are required anyway and -ENOMEM failure paths exists already. struct kvm *kvm_arch_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + int err; if (!kvm) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops; + err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm); + if (err) { + kfree(kvm); + return ERR_PTR(err); + } + return kvm; } mmu_notifier_unregister returns void and it's reliable. The patch also adds a few needed but missing includes that would prevent kernel to compile after these changes on non-x86 archs (x86 didn't need them by luck). [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix mm/filemap_xip.c build] [akpm@linux-foundation.org: fix mm/mmu_notifier.c build] Signed-off-by: Andrea Arcangeli Signed-off-by: Nick Piggin Signed-off-by: Christoph Lameter Cc: Jack Steiner Cc: Robin Holt Cc: Nick Piggin Cc: Peter Zijlstra Cc: Kanoj Sarcar Cc: Roland Dreier Cc: Steve Wise Cc: Avi Kivity Cc: Hugh Dickins Cc: Rusty Russell Cc: Anthony Liguori Cc: Chris Wright Cc: Marcelo Tosatti Cc: Eric Dumazet Cc: "Paul E. McKenney" Cc: Izik Eidus Cc: Anthony Liguori Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 3 + mm/Makefile | 1 + mm/filemap_xip.c | 3 +- mm/fremap.c | 3 + mm/hugetlb.c | 3 + mm/memory.c | 35 +++++-- mm/mmap.c | 2 + mm/mmu_notifier.c | 277 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/mprotect.c | 3 + mm/mremap.c | 6 ++ mm/rmap.c | 13 +-- 11 files changed, 336 insertions(+), 13 deletions(-) create mode 100644 mm/mmu_notifier.c (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index efee5d379df4..446c6588c753 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -208,3 +208,6 @@ config NR_QUICK config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS + +config MMU_NOTIFIER + bool diff --git a/mm/Makefile b/mm/Makefile index 06ca2381fef1..da4ccf015aea 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o obj-$(CONFIG_SLOB) += slob.o +obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 98a3f31ccd6a..380ab402d711 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -188,7 +189,7 @@ __xip_unmap (struct address_space * mapping, if (pte) { /* Nuke the page table entry. 
*/ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); + pteval = ptep_clear_flush_notify(vma, address, pte); page_remove_rmap(page, vma); dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); diff --git a/mm/fremap.c b/mm/fremap.c index 07a9c82ce1a3..7881638e4a12 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, spin_unlock(&mapping->i_mmap_lock); } + mmu_notifier_invalidate_range_start(mm, start, start + size); err = populate_range(mm, vma, start, size, pgoff); + mmu_notifier_invalidate_range_end(mm, start, start + size); if (!err && !(flags & MAP_NONBLOCK)) { if (unlikely(has_write_lock)) { downgrade_write(&mm->mmap_sem); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3be79dc18c5c..80eb0d31d0d3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1672,6 +1673,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); + mmu_notifier_invalidate_range_start(mm, start, end); spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); @@ -1713,6 +1715,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { list_del(&page->lru); put_page(page); diff --git a/mm/memory.c b/mm/memory.c index a8ca04faaea6..67f0ab9077d9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -652,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long next; unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; + int ret; /* * Don't copy ptes where a page fault will fill them correctly. @@ -667,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); + /* + * We need to invalidate the secondary MMU mappings only when + * there could be a permission downgrade on the ptes of the + * parent mm. And a permission downgrade will only happen if + * is_cow_mapping() returns true. + */ + if (is_cow_mapping(vma->vm_flags)) + mmu_notifier_invalidate_range_start(src_mm, addr, end); + + ret = 0; dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; - if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next)) - return -ENOMEM; + if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, + vma, addr, next))) { + ret = -ENOMEM; + break; + } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - return 0; + + if (is_cow_mapping(vma->vm_flags)) + mmu_notifier_invalidate_range_end(src_mm, + vma->vm_start, end); + return ret; } static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -881,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? 
details->i_mmap_lock: NULL; int fullmm = (*tlbp)->fullmm; + struct mm_struct *mm = vma->vm_mm; + mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; @@ -946,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, } } out: + mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); return start; /* which is now the end (or restart) address */ } @@ -1616,10 +1637,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, { pgd_t *pgd; unsigned long next; - unsigned long end = addr + size; + unsigned long start = addr, end = addr + size; int err; BUG_ON(addr >= end); + mmu_notifier_invalidate_range_start(mm, start, end); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -1627,6 +1649,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, if (err) break; } while (pgd++, addr = next, addr != end); + mmu_notifier_invalidate_range_end(mm, start, end); return err; } EXPORT_SYMBOL_GPL(apply_to_page_range); @@ -1839,7 +1862,7 @@ gotten: * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush(vma, address, page_table); + ptep_clear_flush_notify(vma, address, page_table); set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); lru_cache_add_active(new_page); diff --git a/mm/mmap.c b/mm/mmap.c index e5f9cb83d6d4..245c3d69067b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -2061,6 +2062,7 @@ void exit_mmap(struct mm_struct *mm) /* mm's last user has gone, and its about to be pulled down */ arch_exit_mmap(mm); + mmu_notifier_release(mm); lru_add_drain(); flush_cache_mm(mm); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c new file mode 100644 index 000000000000..5f4ef0250bee --- /dev/null +++ b/mm/mmu_notifier.c @@ -0,0 +1,277 @@ +/* + * linux/mm/mmu_notifier.c + * + * Copyright (C) 2008 Qumranet, Inc. + * Copyright (C) 2008 SGI + * Christoph Lameter + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * This function can't run concurrently against mmu_notifier_register + * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap + * runs with mm_users == 0. Other tasks may still invoke mmu notifiers + * in parallel despite there being no task using this mm any more, + * through the vmas outside of the exit_mmap context, such as with + * vmtruncate. This serializes against mmu_notifier_unregister with + * the mmu_notifier_mm->lock in addition to RCU and it serializes + * against the other mmu notifiers with RCU. struct mmu_notifier_mm + * can't go away from under us as exit_mmap holds an mm_count pin + * itself. + */ +void __mmu_notifier_release(struct mm_struct *mm) +{ + struct mmu_notifier *mn; + + spin_lock(&mm->mmu_notifier_mm->lock); + while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { + mn = hlist_entry(mm->mmu_notifier_mm->list.first, + struct mmu_notifier, + hlist); + /* + * We arrived before mmu_notifier_unregister so + * mmu_notifier_unregister will do nothing other than + * to wait ->release to finish and + * mmu_notifier_unregister to return. + */ + hlist_del_init_rcu(&mn->hlist); + /* + * RCU here will block mmu_notifier_unregister until + * ->release returns. 
+ */ + rcu_read_lock(); + spin_unlock(&mm->mmu_notifier_mm->lock); + /* + * if ->release runs before mmu_notifier_unregister it + * must be handled as it's the only way for the driver + * to flush all existing sptes and stop the driver + * from establishing any more sptes before all the + * pages in the mm are freed. + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + rcu_read_unlock(); + spin_lock(&mm->mmu_notifier_mm->lock); + } + spin_unlock(&mm->mmu_notifier_mm->lock); + + /* + * synchronize_rcu here prevents mmu_notifier_release to + * return to exit_mmap (which would proceed freeing all pages + * in the mm) until the ->release method returns, if it was + * invoked by mmu_notifier_unregister. + * + * The mmu_notifier_mm can't go away from under us because one + * mm_count is hold by exit_mmap. + */ + synchronize_rcu(); +} + +/* + * If no young bitflag is supported by the hardware, ->clear_flush_young can + * unmap the address and return 1 or 0 depending if the mapping previously + * existed or not. + */ +int __mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + int young = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->clear_flush_young) + young |= mn->ops->clear_flush_young(mn, mm, address); + } + rcu_read_unlock(); + + return young; +} + +void __mmu_notifier_invalidate_page(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->invalidate_page) + mn->ops->invalidate_page(mn, mm, address); + } + rcu_read_unlock(); +} + +void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->invalidate_range_start) + mn->ops->invalidate_range_start(mn, mm, start, end); + } + rcu_read_unlock(); +} + +void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->invalidate_range_end) + mn->ops->invalidate_range_end(mn, mm, start, end); + } + rcu_read_unlock(); +} + +static int do_mmu_notifier_register(struct mmu_notifier *mn, + struct mm_struct *mm, + int take_mmap_sem) +{ + struct mmu_notifier_mm *mmu_notifier_mm; + int ret; + + BUG_ON(atomic_read(&mm->mm_users) <= 0); + + ret = -ENOMEM; + mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); + if (unlikely(!mmu_notifier_mm)) + goto out; + + if (take_mmap_sem) + down_write(&mm->mmap_sem); + ret = mm_take_all_locks(mm); + if (unlikely(ret)) + goto out_cleanup; + + if (!mm_has_notifiers(mm)) { + INIT_HLIST_HEAD(&mmu_notifier_mm->list); + spin_lock_init(&mmu_notifier_mm->lock); + mm->mmu_notifier_mm = mmu_notifier_mm; + mmu_notifier_mm = NULL; + } + atomic_inc(&mm->mm_count); + + /* + * Serialize the update against mmu_notifier_unregister. A + * side note: mmu_notifier_release can't run concurrently with + * us because we hold the mm_users pin (either implicitly as + * current->mm or explicitly with get_task_mm() or similar). + * We can't race against any other mmu notifier method either + * thanks to mm_take_all_locks(). 
+ */ + spin_lock(&mm->mmu_notifier_mm->lock); + hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); + spin_unlock(&mm->mmu_notifier_mm->lock); + + mm_drop_all_locks(mm); +out_cleanup: + if (take_mmap_sem) + up_write(&mm->mmap_sem); + /* kfree() does nothing if mmu_notifier_mm is NULL */ + kfree(mmu_notifier_mm); +out: + BUG_ON(atomic_read(&mm->mm_users) <= 0); + return ret; +} + +/* + * Must not hold mmap_sem nor any other VM related lock when calling + * this registration function. Must also ensure mm_users can't go down + * to zero while this runs to avoid races with mmu_notifier_release, + * so mm has to be current->mm or the mm should be pinned safely such + * as with get_task_mm(). If the mm is not current->mm, the mm_users + * pin should be released by calling mmput after mmu_notifier_register + * returns. mmu_notifier_unregister must be always called to + * unregister the notifier. mm_count is automatically pinned to allow + * mmu_notifier_unregister to safely run at any time later, before or + * after exit_mmap. ->release will always be called before exit_mmap + * frees the pages. + */ +int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +{ + return do_mmu_notifier_register(mn, mm, 1); +} +EXPORT_SYMBOL_GPL(mmu_notifier_register); + +/* + * Same as mmu_notifier_register but here the caller must hold the + * mmap_sem in write mode. + */ +int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +{ + return do_mmu_notifier_register(mn, mm, 0); +} +EXPORT_SYMBOL_GPL(__mmu_notifier_register); + +/* this is called after the last mmu_notifier_unregister() returned */ +void __mmu_notifier_mm_destroy(struct mm_struct *mm) +{ + BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list)); + kfree(mm->mmu_notifier_mm); + mm->mmu_notifier_mm = LIST_POISON1; /* debug */ +} + +/* + * This releases the mm_count pin automatically and frees the mm + * structure if it was the last user of it. It serializes against + * running mmu notifiers with RCU and against mmu_notifier_unregister + * with the unregister lock + RCU. All sptes must be dropped before + * calling mmu_notifier_unregister. ->release or any other notifier + * method may be invoked concurrently with mmu_notifier_unregister, + * and only after mmu_notifier_unregister returned we're guaranteed + * that ->release or any other method can't run anymore. + */ +void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) +{ + BUG_ON(atomic_read(&mm->mm_count) <= 0); + + spin_lock(&mm->mmu_notifier_mm->lock); + if (!hlist_unhashed(&mn->hlist)) { + hlist_del_rcu(&mn->hlist); + + /* + * RCU here will force exit_mmap to wait ->release to finish + * before freeing the pages. + */ + rcu_read_lock(); + spin_unlock(&mm->mmu_notifier_mm->lock); + /* + * exit_mmap will block in mmu_notifier_release to + * guarantee ->release is called before freeing the + * pages. + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + rcu_read_unlock(); + } else + spin_unlock(&mm->mmu_notifier_mm->lock); + + /* + * Wait any running method to finish, of course including + * ->release if it was run by mmu_notifier_relase instead of us. 
+ */ + synchronize_rcu(); + + BUG_ON(atomic_read(&mm->mm_count) <= 0); + + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_notifier_unregister); diff --git a/mm/mprotect.c b/mm/mprotect.c index abd645a3b0a0..fded06f923f4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -203,10 +204,12 @@ success: dirty_accountable = 1; } + mmu_notifier_invalidate_range_start(mm, start, end); if (is_vm_hugetlb_page(vma)) hugetlb_change_protection(vma, start, end, vma->vm_page_prot); else change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + mmu_notifier_invalidate_range_end(mm, start, end); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; diff --git a/mm/mremap.c b/mm/mremap.c index 08e3c7f2bd15..1a7743923c8c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -74,7 +75,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; + unsigned long old_start; + old_start = old_addr; + mmu_notifier_invalidate_range_start(vma->vm_mm, + old_start, old_end); if (vma->vm_file) { /* * Subtle point from Rajesh Venkatasubramanian: before @@ -116,6 +121,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) spin_unlock(&mapping->i_mmap_lock); + mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); } #define LATENCY_LIMIT (64 * PAGE_SIZE) diff --git a/mm/rmap.c b/mm/rmap.c index 39ae5a9bf382..99bc3f9cd796 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -287,7 +288,7 @@ static int page_referenced_one(struct page *page, if (vma->vm_flags & VM_LOCKED) { referenced++; *mapcount = 1; /* break early from loop */ - } else if (ptep_clear_flush_young(vma, address, pte)) + } else if (ptep_clear_flush_young_notify(vma, address, pte)) referenced++; /* Pretend the page is referenced if the task has the @@ -457,7 +458,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) pte_t entry; flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush(vma, address, pte); + entry = ptep_clear_flush_notify(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(mm, address, pte, entry); @@ -705,14 +706,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * skipped over this mm) then we should reactivate it. */ if (!migration && ((vma->vm_flags & VM_LOCKED) || - (ptep_clear_flush_young(vma, address, pte)))) { + (ptep_clear_flush_young_notify(vma, address, pte)))) { ret = SWAP_FAIL; goto out_unmap; } /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush(vma, address, pte); + pteval = ptep_clear_flush_notify(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) @@ -837,12 +838,12 @@ static void try_to_unmap_cluster(unsigned long cursor, page = vm_normal_page(vma, address, *pte); BUG_ON(!page || PageAnon(page)); - if (ptep_clear_flush_young(vma, address, pte)) + if (ptep_clear_flush_young_notify(vma, address, pte)) continue; /* Nuke the page table entry. 
*/ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); + pteval = ptep_clear_flush_notify(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ if (page->index != linear_page_index(vma, address)) -- cgit v1.2.2 From 78a34ae29bf1c9df62a5bd0f0798b6c62a54d520 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 28 Jul 2008 15:46:30 -0700 Subject: mm/hugetlb.c must #include This patch fixes the following build error on sh caused by commit aa888a74977a8f2120ae9332376e179c39a6b07d ("hugetlb: support larger than MAX_ORDER"): mm/hugetlb.c: In function 'alloc_bootmem_huge_page': mm/hugetlb.c:958: error: implicit declaration of function 'virt_to_phys' Signed-off-by: Adrian Bunk Cc: Hirokazu Takata Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 80eb0d31d0d3..254ce2b90158 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -20,6 +20,7 @@ #include #include +#include #include #include "internal.h" -- cgit v1.2.2 From 8ab22b9abb5c55413802e4adc9aa6223324547c3 Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Mon, 28 Jul 2008 15:46:36 -0700 Subject: vfs: pagecache usage optimization for pagesize!=blocksize When we read some part of a file through pagecache, if there is a pagecache of corresponding index but this page is not uptodate, read IO is issued and this page will be uptodate. I think this is good for pagesize == blocksize environment but there is room for improvement on pagesize != blocksize environment. Because in this case a page can have multiple buffers and even if a page is not uptodate, some buffers can be uptodate. So I suggest that when all buffers which correspond to a part of a file that we want to read are uptodate, use this pagecache and copy data from this pagecache to user buffer even if a page is not uptodate. This can reduce read IO and improve system throughput. I wrote a benchmark program and got result number with this program. This benchmark do: 1: mount and open a test file. 2: create a 512MB file. 3: close a file and umount. 4: mount and again open a test file. 5: pwrite randomly 300000 times on a test file. offset is aligned by IO size(1024bytes). 6: measure time of preading randomly 100000 times on a test file. The result was: 2.6.26 330 sec 2.6.26-patched 226 sec Arch:i386 Filesystem:ext3 Blocksize:1024 bytes Memory: 1GB On ext3/4, a file is written through buffer/block. So random read/write mixed workloads or random read after random write workloads are optimized with this patch under pagesize != blocksize environment. This test result showed this. 
The benchmark program is as follows: #include #include #include #include #include #include #include #include #include #define LEN 1024 #define LOOP 1024*512 /* 512MB */ main(void) { unsigned long i, offset, filesize; int fd; char buf[LEN]; time_t t1, t2; if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) { perror("cannot mount\n"); exit(1); } memset(buf, 0, LEN); fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC); if (fd < 0) { perror("cannot open file\n"); exit(1); } for (i = 0; i < LOOP; i++) write(fd, buf, LEN); close(fd); if (umount("/root/test1/") < 0) { perror("cannot umount\n"); exit(1); } if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) { perror("cannot mount\n"); exit(1); } fd = open("/root/test1/testfile", O_RDWR); if (fd < 0) { perror("cannot open file\n"); exit(1); } filesize = LEN * LOOP; for (i = 0; i < 300000; i++){ offset = (random() % filesize) & (~(LEN - 1)); pwrite(fd, buf, LEN, offset); } printf("start test\n"); time(&t1); for (i = 0; i < 100000; i++){ offset = (random() % filesize) & (~(LEN - 1)); pread(fd, buf, LEN, offset); } time(&t2); printf("%ld sec\n", t2-t1); close(fd); if (umount("/root/test1/") < 0) { perror("cannot umount\n"); exit(1); } } Signed-off-by: Hisashi Hifumi Cc: Nick Piggin Cc: Christoph Hellwig Cc: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 5de7633e1dbe..42bbc6909ba4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1023,8 +1023,17 @@ find_page: ra, filp, page, index, last_index - index); } - if (!PageUptodate(page)) - goto page_not_up_to_date; + if (!PageUptodate(page)) { + if (inode->i_blkbits == PAGE_CACHE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + if (TestSetPageLocked(page)) + goto page_not_up_to_date; + if (!mapping->a_ops->is_partially_uptodate(page, + desc, offset)) + goto page_not_up_to_date_locked; + unlock_page(page); + } page_ok: /* * i_size must be checked after we know the page is Uptodate. @@ -1094,6 +1103,7 @@ page_not_up_to_date: if (lock_page_killable(page)) goto readpage_eio; +page_not_up_to_date_locked: /* Did it get truncated before we got the lock? */ if (!page->mapping) { unlock_page(page); -- cgit v1.2.2 From 7cb93181629c613ee2b8f4ffe3446f8003074842 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 30 Jul 2008 02:18:26 +0900 Subject: mm/hugetlb.c must #include This patch fixes the following build error on sh caused by commit aa888a74977a8f2120ae9332376e179c39a6b07d (hugetlb: support larger than MAX_ORDER): <-- snip --> ... CC mm/hugetlb.o /home/bunk/linux/kernel-2.6/git/linux-2.6/mm/hugetlb.c: In function 'alloc_bootmem_huge_page': /home/bunk/linux/kernel-2.6/git/linux-2.6/mm/hugetlb.c:958: error: implicit declaration of function 'virt_to_phys' make[2]: *** [mm/hugetlb.o] Error 1 <-- snip --> Reported-by: Adrian Bunk Signed-off-by: Adrian Bunk Signed-off-by: Paul Mundt --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3be79dc18c5c..b3c78640b629 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -16,7 +16,7 @@ #include #include #include - +#include #include #include -- cgit v1.2.2 From 231367fd9bccbb36309ab5bf5012e11a84231031 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 22 Jul 2008 20:21:16 +0300 Subject: mm: unexport ksize This patch removes the obsolete and no longer used exports of ksize. 
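Stepping back for a moment to the pagesize != blocksize read optimization above: the address_space operation that the read path now consults is, to the best of my reading, shaped like this (hedged sketch; the generic buffer-head helper named below comes from the non-mm half of that commit as I recall it and should be treated as illustrative):

        /*
         * Return non-zero when the buffers covering [from, from + desc->count)
         * of the page are all uptodate, so the read can be satisfied from the
         * pagecache even though PageUptodate() is not set.
         */
        int (*is_partially_uptodate)(struct page *page, read_descriptor_t *desc,
                                     unsigned long from);

        /* A blocksize < pagesize filesystem would wire it up in its aops, e.g.: */
        /*      .is_partially_uptodate = block_is_partially_uptodate,            */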
Signed-off-by: Adrian Bunk Signed-off-by: Pekka Enberg --- mm/slab.c | 1 - mm/slob.c | 1 - mm/slub.c | 1 - 3 files changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 918f04f7fef1..e76eee466886 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4472,4 +4472,3 @@ size_t ksize(const void *objp) return obj_size(virt_to_cache(objp)); } -EXPORT_SYMBOL(ksize); diff --git a/mm/slob.c b/mm/slob.c index d8fbd4d1bfa7..4c82dd41f32e 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -519,7 +519,6 @@ size_t ksize(const void *block) else return sp->page.private; } -EXPORT_SYMBOL(ksize); struct kmem_cache { unsigned int size, align; diff --git a/mm/slub.c b/mm/slub.c index b7e2cd5d82db..c26d4c36fba9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2715,7 +2715,6 @@ size_t ksize(const void *object) */ return s->size; } -EXPORT_SYMBOL(ksize); void kfree(const void *x) { -- cgit v1.2.2 From 1d1958f05095a7e9ecbba86235122784a3d1b561 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 29 Jul 2008 22:33:16 -0700 Subject: mm: remove find_max_pfn_with_active_regions It has no user now Also print out info about adding/removing active regions. Signed-off-by: Yinghai Lu Acked-by: Mel Gorman Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3cf3d05b6bd4..401d104d2bb6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3753,23 +3753,6 @@ unsigned long __init find_min_pfn_with_active_regions(void) return find_min_pfn_for_node(MAX_NUMNODES); } -/** - * find_max_pfn_with_active_regions - Find the maximum PFN registered - * - * It returns the maximum PFN based on information provided via - * add_active_range(). - */ -unsigned long __init find_max_pfn_with_active_regions(void) -{ - int i; - unsigned long max_pfn = 0; - - for (i = 0; i < nr_nodemap_entries; i++) - max_pfn = max(max_pfn, early_node_map[i].end_pfn); - - return max_pfn; -} - /* * early_calculate_totalpages() * Sum pages in active regions for movable zone. -- cgit v1.2.2 From 4ef1b0fd61333b3b81ebe29283898c6c84b15c9f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:20 -0700 Subject: memcg: remove redundant check in move_task() It's guaranteed by cgroup that old_cgrp != cgrp. 
Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fba566c51322..7056c3bdb478 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1168,9 +1168,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, mem = mem_cgroup_from_cont(cont); old_mem = mem_cgroup_from_cont(old_cont); - if (mem == old_mem) - goto out; - /* * Only thread group leaders are allowed to migrate, the mm_struct is * in effect owned by the leader -- cgit v1.2.2 From 7e6cbea39aaa32480145915751119227f29f6f7b Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Tue, 29 Jul 2008 22:33:39 -0700 Subject: madvise: update function comment of madvise_dontneed Signed-off-by: Fernando Luis Vazquez Cao Cc: Hugh Dickins Cc: Nick Piggin Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index 23a0ec3e0ea0..f9349c18a1b5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -132,10 +132,10 @@ static long madvise_willneed(struct vm_area_struct * vma, * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The - * zap_page_range call sets things up for refill_inactive to actually free + * zap_page_range call sets things up for shrink_active_list to actually free * these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for - * refill_inactive to pick up before reclaiming other pages. + * shrink_active_list to pick up before reclaiming other pages. * * NB: This interface discards data rather than pushes it out to swap, * as some implementations do. This has performance implications for -- cgit v1.2.2 From ab33dc09a5c0d2bd6757afa1c2f804c9657daec0 Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Tue, 29 Jul 2008 22:33:40 -0700 Subject: swap: update function comment of release_pages Signed-off-by: Fernando Luis Vazquez Cao Cc: Hugh Dickins Cc: Nick Piggin Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index dd89234ee51f..7417a2adbe50 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -278,9 +278,10 @@ int lru_add_drain_all(void) * Avoid taking zone->lru_lock if possible, but if it is taken, retain it * for the remainder of the operation. * - * The locking in this function is against shrink_cache(): we recheck the - * page count inside the lock to see whether shrink_cache grabbed the page - * via the LRU. If it did, give up: shrink_cache will free it. + * The locking in this function is against shrink_inactive_list(): we recheck + * the page count inside the lock to see whether shrink_inactive_list() + * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() + * will free it. 
*/ void release_pages(struct page **pages, int nr, int cold) { -- cgit v1.2.2 From 7d03431cf98aaed635524024273668bb8cedadda Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Tue, 29 Jul 2008 22:33:41 -0700 Subject: swapfile/vmscan: update comments related to vmscan functions Signed-off-by: Fernando Luis Vazquez Cao Cc: Hugh Dickins Cc: Nick Piggin Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 ++-- mm/vmscan.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 6beb6251e99d..bb7f79641f9e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -656,8 +656,8 @@ static int unuse_mm(struct mm_struct *mm, if (!down_read_trylock(&mm->mmap_sem)) { /* - * Activate page so shrink_cache is unlikely to unmap its - * ptes while lock is dropped, so swapoff can make progress. + * Activate page so shrink_inactive_list is unlikely to unmap + * its ptes while lock is dropped, so swapoff can make progress. */ activate_page(page); unlock_page(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 8f71761bc4b7..48ffb13f0d07 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1979,7 +1979,7 @@ module_init(kswapd_init) int zone_reclaim_mode __read_mostly; #define RECLAIM_OFF 0 -#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ +#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ -- cgit v1.2.2 From 87547ee95d81ec0ee1503fcaf9c9594469bc2510 Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Tue, 29 Jul 2008 22:33:42 -0700 Subject: do_try_to_free_page: update comments related to vmscan functions Signed-off-by: Fernando Luis Vazquez Cao Cc: Hugh Dickins Cc: Nick Piggin Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 48ffb13f0d07..75be453628bf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1408,7 +1408,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (sc->nr_scanned && priority < DEF_PRIORITY - 2) congestion_wait(WRITE, HZ/10); } - /* top priority shrink_caches still had more to do? don't OOM, then */ + /* top priority shrink_zones still had more to do? don't OOM, then */ if (!sc->all_unreclaimable && scan_global_lru(sc)) ret = nr_reclaimed; out: -- cgit v1.2.2 From c627f9cc046c7cd93b4525d89377fb409e170a18 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 29 Jul 2008 22:33:53 -0700 Subject: mm: add zap_vma_ptes(): a library function to unmap driver ptes zap_vma_ptes() is intended to be used by drivers to unmap ptes assigned to the driver private vmas. This interface is similar to zap_page_range() but is less general & less likely to be abused. Needed by the GRU driver. 
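To illustrate the intended use, here is a hypothetical driver-side caller (the driver name and function are made up; the vma is assumed to be a VM_PFNMAP mapping owned by the driver):

#include <linux/kernel.h>
#include <linux/mm.h>

/* Hypothetical example, not part of the patch. */
static void mydrv_teardown_user_mapping(struct vm_area_struct *vma)
{
	/* Unmap every pte in the driver's private VM_PFNMAP mapping. */
	if (zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start))
		printk(KERN_WARNING "mydrv: range not inside the vma or not VM_PFNMAP\n");
}

The whole range lies inside the vma here by construction, so the call should only fail if the mapping is not VM_PFNMAP.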
Signed-off-by: Jack Steiner Cc: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 67f0ab9077d9..6793b9c68107 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -994,6 +994,29 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, return end; } +/** + * zap_vma_ptes - remove ptes mapping the vma + * @vma: vm_area_struct holding ptes to be zapped + * @address: starting address of pages to zap + * @size: number of bytes to zap + * + * This function only unmaps ptes assigned to VM_PFNMAP vmas. + * + * The entire address range must be fully contained within the vma. + * + * Returns 0 if successful. + */ +int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, + unsigned long size) +{ + if (address < vma->vm_start || address + size > vma->vm_end || + !(vma->vm_flags & VM_PFNMAP)) + return -1; + zap_page_range(vma, address, size, NULL); + return 0; +} +EXPORT_SYMBOL_GPL(zap_vma_ptes); + /* * Do a quick page-table lookup for a single page. */ -- cgit v1.2.2 From 0d39741a27d86d305cc75ba626392be410dcbab9 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 29 Jul 2008 22:34:01 -0700 Subject: GRU Driver: export is_uv_system(), zap_page_range() & follow_page() Exports needed by the GRU driver. Signed-off-by: Jack Steiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 6793b9c68107..0e4eea10c7b0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -993,6 +993,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, tlb_finish_mmu(tlb, address, end); return end; } +EXPORT_SYMBOL_GPL(zap_page_range); /** * zap_vma_ptes - remove ptes mapping the vma @@ -1110,6 +1111,7 @@ no_page_table: } return page; } +EXPORT_SYMBOL_GPL(follow_page); /* Can we do the FOLL_ANON optimization? */ static inline int use_zero_page(struct vm_area_struct *vma) -- cgit v1.2.2 From 94ad374a0751f40d25e22e036c37f7263569d24c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 30 Jul 2008 14:45:12 -0700 Subject: Fix off-by-one error in iov_iter_advance() The iov_iter_advance() function would look at the iov->iov_len entry even though it might have iterated over the whole array, and iov was pointing past the end. This would cause DEBUG_PAGEALLOC to trigger a kernel page fault if the allocation was at the end of a page, and the next page was unallocated. The quick fix is to just change the order of the tests: check that there is any iovec data left before we check the iov entry itself. Thanks to Alexey Dobriyan for finding this case, and testing the fix. Reported-and-tested-by: Alexey Dobriyan Cc: Nick Piggin Cc: Andrew Morton Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 42bbc6909ba4..d97d1ad55473 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1879,7 +1879,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) * The !iov->iov_len check ensures we skip over unlikely * zero-length segments (without overruning the iovec). 
*/ - while (bytes || unlikely(!iov->iov_len && i->count)) { + while (bytes || unlikely(i->count && !iov->iov_len)) { int copy; copy = min(bytes, iov->iov_len - base); -- cgit v1.2.2 From a4b526b3ba6353cd89a38e41da48ed83b0ead16f Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 1 Aug 2008 16:39:12 +0200 Subject: [S390] Optimize storage key operations for anon pages For anonymous pages without a swap cache backing the check in page_remove_rmap for the physical dirty bit in page_remove_rmap is unnecessary. The instructions that are used to check and reset the dirty bit are expensive. Removing the check noticably speeds up process exit. In addition the clearing of the dirty bit in __SetPageUptodate is pointless as well. With these two changes there is no storage key operation for an anonymous page anymore if it does not hit the swap space. The micro benchmark which repeatedly executes an empty shell script gets about 5% faster. Signed-off-by: Martin Schwidefsky --- mm/rmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 99bc3f9cd796..94a5246a3f98 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -667,7 +667,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ - if (page_test_dirty(page)) { + if ((!PageAnon(page) || PageSwapCache(page)) && + page_test_dirty(page)) { page_clear_dirty(page); set_page_dirty(page); } -- cgit v1.2.2 From 0ef89d25d3e390dfa7c46772907951744a4067dc Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 31 Jul 2008 00:07:30 -0700 Subject: mm/hugetlb: don't crash when HPAGE_SHIFT is 0 Some platform decide whether they support huge pages at boot time. On these, such as powerpc, HPAGE_SHIFT is a variable, not a constant, and is set to 0 when there is no such support. The patches to introduce multiple huge pages support broke that causing the kernel to crash at boot time on machines such as POWER3 which lack support for multiple page sizes. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d237a02eb228..28a2980ee435 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1283,7 +1283,12 @@ module_exit(hugetlb_exit); static int __init hugetlb_init(void) { - BUILD_BUG_ON(HPAGE_SHIFT == 0); + /* Some platform decide whether they support huge pages at boot + * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when + * there is no such support + */ + if (HPAGE_SHIFT == 0) + return 0; if (!size_to_hstate(default_hstate_size)) { default_hstate_size = HPAGE_SIZE; -- cgit v1.2.2 From 3669bc143fb3d389918379547f4a6b28a757b7fe Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Fri, 1 Aug 2008 15:08:15 -0500 Subject: Remove EXPORTS of follow_page & zap_page_range Delete 2 EXPORTs that were accidentally sent upstream. 
Signed-off-by: Jack Steiner Signed-off-by: Linus Torvalds --- mm/memory.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 0e4eea10c7b0..6793b9c68107 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -993,7 +993,6 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, tlb_finish_mmu(tlb, address, end); return end; } -EXPORT_SYMBOL_GPL(zap_page_range); /** * zap_vma_ptes - remove ptes mapping the vma @@ -1111,7 +1110,6 @@ no_page_table: } return page; } -EXPORT_SYMBOL_GPL(follow_page); /* Can we do the FOLL_ANON optimization? */ static inline int use_zero_page(struct vm_area_struct *vma) -- cgit v1.2.2 From 84209e02de48d72289650cc5a7ae8dd18223620f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 1 Aug 2008 20:28:47 +0200 Subject: mm: dont clear PG_uptodate on truncate/invalidate Brian Wang reported that a FUSE filesystem exported through NFS could return I/O errors on read. This was traced to splice_direct_to_actor() returning a short or zero count when racing with page invalidation. However this is not FUSE or NFSD specific, other filesystems (notably NFS) also call invalidate_inode_pages2() to purge stale data from the cache. If this happens while such pages are sitting in a pipe buffer, then splice(2) from the pipe can return zero, and read(2) from the pipe can return ENODATA. The zero return is especially bad, since it implies end-of-file or disconnected pipe/socket, and is documented as such for splice. But returning an error for read() is also nasty, when in fact there was no error (data becoming stale is not an error). The same problems can be triggered by "hole punching" with madvise(MADV_REMOVE). Fix this by not clearing the PG_uptodate flag on truncation and invalidation. Signed-off-by: Miklos Szeredi Acked-by: Nick Piggin Cc: Andrew Morton Cc: Jens Axboe Signed-off-by: Linus Torvalds --- mm/truncate.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index e68443d74567..894e9a70699f 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -104,7 +104,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) cancel_dirty_page(page, PAGE_CACHE_SIZE); remove_from_page_cache(page); - ClearPageUptodate(page); ClearPageMappedToDisk(page); page_cache_release(page); /* pagecache ref */ } @@ -356,7 +355,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) BUG_ON(PagePrivate(page)); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); - ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; failed: -- cgit v1.2.2 From 1af446edfe3239b2b731f3458b3c285c397464cc Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 4 Aug 2008 16:01:47 +0900 Subject: nommu: Provide vmalloc_exec(). Now that SH has switched to vmalloc_exec() for PAGE_KERNEL_EXEC usage, it's apparent that nommu has no vmalloc_exec() definition of its own. Stub in the one from mm/vmalloc.c. 
Signed-off-by: Paul Mundt --- mm/nommu.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 5edccd9c9218..ed75bc962fbe 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -266,6 +266,27 @@ void *vmalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vmalloc_node); +#ifndef PAGE_KERNEL_EXEC +# define PAGE_KERNEL_EXEC PAGE_KERNEL +#endif + +/** + * vmalloc_exec - allocate virtually contiguous, executable memory + * @size: allocation size + * + * Kernel-internal function to allocate enough pages to cover @size + * the page level allocator and map them into contiguous and + * executable kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ + +void *vmalloc_exec(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); +} + /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size -- cgit v1.2.2 From a477097d9c37c1cf289c7f0257dffcfa42d50197 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 4 Aug 2008 13:41:14 -0700 Subject: mlock() fix return values Halesh says: Please find the below testcase provide to test mlock. Test Case : =========================== #include #include #include #include #include #include #include #include #include int main(void) { int fd,ret, i = 0; char *addr, *addr1 = NULL; unsigned int page_size; struct rlimit rlim; if (0 != geteuid()) { printf("Execute this pgm as root\n"); exit(1); } /* create a file */ if ((fd = open("mmap_test.c",O_RDWR|O_CREAT,0755)) == -1) { printf("cant create test file\n"); exit(1); } page_size = sysconf(_SC_PAGE_SIZE); /* set the MEMLOCK limit */ rlim.rlim_cur = 2000; rlim.rlim_max = 2000; if ((ret = setrlimit(RLIMIT_MEMLOCK,&rlim)) != 0) { printf("Cant change limit values\n"); exit(1); } addr = 0; while (1) { /* map a page into memory each time*/ if ((addr = (char *) mmap(addr,page_size, PROT_READ | PROT_WRITE,MAP_SHARED,fd,0)) == MAP_FAILED) { printf("cant do mmap on file\n"); exit(1); } if (0 == i) addr1 = addr; i++; errno = 0; /* lock the mapped memory pagewise*/ if ((ret = mlock((char *)addr, 1500)) == -1) { printf("errno value is %d\n", errno); printf("cant lock maped region\n"); exit(1); } addr = addr + page_size; } } ====================================================== This testcase results in an mlock() failure with errno 14 that is EFAULT, but it has nowhere been specified that mlock() will return EFAULT. When I tested the same on older kernels like 2.6.18, I got the correct result i.e errno 12 (ENOMEM). I think in source code mlock(2), setting errno ENOMEM has been missed in do_mlock() , on mlock_fixup() failure. SUSv3 requires the following behavior frmo mlock(2). [ENOMEM] Some or all of the address range specified by the addr and len arguments does not correspond to valid mapped pages in the address space of the process. [EAGAIN] Some or all of the memory identified by the operation could not be locked when the call was made. This rule isn't so nice and slighly strange. but many people think POSIX/SUS compliance is important. 
Reported-by: Halesh Sadashiv Tested-by: Halesh Sadashiv Signed-off-by: KOSAKI Motohiro Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 16 +++++++++++++--- mm/mlock.c | 2 -- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 6793b9c68107..a472bcd4b061 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2765,16 +2765,26 @@ int make_pages_present(unsigned long addr, unsigned long end) vma = find_vma(current->mm, addr); if (!vma) - return -1; + return -ENOMEM; write = (vma->vm_flags & VM_WRITE) != 0; BUG_ON(addr >= end); BUG_ON(end > vma->vm_end); len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); - if (ret < 0) + if (ret < 0) { + /* + SUS require strange return value to mlock + - invalid addr generate to ENOMEM. + - out of memory should generate EAGAIN. + */ + if (ret == -EFAULT) + ret = -ENOMEM; + else if (ret == -ENOMEM) + ret = -EAGAIN; return ret; - return ret == len ? 0 : -1; + } + return ret == len ? 0 : -ENOMEM; } #if !defined(__HAVE_ARCH_GATE_AREA) diff --git a/mm/mlock.c b/mm/mlock.c index 7b2656055d6a..01fbe93eff5c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -78,8 +78,6 @@ success: mm->locked_vm -= pages; out: - if (ret == -ENOMEM) - ret = -EAGAIN; return ret; } -- cgit v1.2.2 From 529ae9aaa08378cfe2a4350bded76f32cc8ff0ce Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 2 Aug 2008 12:01:03 +0200 Subject: mm: rename page trylock Converting page lock to new locking bitops requires a change of page flag operation naming, so we might as well convert it to something nicer (!TestSetPageLocked_Lock => trylock_page, SetPageLocked => set_page_locked). This also facilitates lockdeping of page lock. Signed-off-by: Nick Piggin Acked-by: KOSAKI Motohiro Acked-by: Peter Zijlstra Acked-by: Andrew Morton Acked-by: Benjamin Herrenschmidt Signed-off-by: Linus Torvalds --- mm/filemap.c | 12 ++++++------ mm/memory.c | 2 +- mm/migrate.c | 4 ++-- mm/rmap.c | 2 +- mm/shmem.c | 4 ++-- mm/swap.c | 2 +- mm/swap_state.c | 8 ++++---- mm/swapfile.c | 2 +- mm/truncate.c | 4 ++-- mm/vmscan.c | 4 ++-- 10 files changed, 22 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index d97d1ad55473..54e968650855 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -558,14 +558,14 @@ EXPORT_SYMBOL(wait_on_page_bit); * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. * * The first mb is necessary to safely close the critical section opened by the - * TestSetPageLocked(), the second mb is necessary to enforce ordering between - * the clear_bit and the read of the waitqueue (to avoid SMP races with a - * parallel wait_on_page_locked()). + * test_and_set_bit() to lock the page; the second mb is necessary to enforce + * ordering between the clear_bit and the read of the waitqueue (to avoid SMP + * races with a parallel wait_on_page_locked()). 
*/ void unlock_page(struct page *page) { smp_mb__before_clear_bit(); - if (!TestClearPageLocked(page)) + if (!test_and_clear_bit(PG_locked, &page->flags)) BUG(); smp_mb__after_clear_bit(); wake_up_page(page, PG_locked); @@ -931,7 +931,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) struct page *page = find_get_page(mapping, index); if (page) { - if (!TestSetPageLocked(page)) + if (trylock_page(page)) return page; page_cache_release(page); return NULL; @@ -1027,7 +1027,7 @@ find_page: if (inode->i_blkbits == PAGE_CACHE_SHIFT || !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto page_not_up_to_date; if (!mapping->a_ops->is_partially_uptodate(page, desc, offset)) diff --git a/mm/memory.c b/mm/memory.c index a472bcd4b061..1002f473f497 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1789,7 +1789,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * not dirty accountable. */ if (PageAnon(old_page)) { - if (!TestSetPageLocked(old_page)) { + if (trylock_page(old_page)) { reuse = can_share_swap_page(old_page); unlock_page(old_page); } diff --git a/mm/migrate.c b/mm/migrate.c index 153572fb60b8..2a80136b23bb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -605,7 +605,7 @@ static int move_to_new_page(struct page *newpage, struct page *page) * establishing additional references. We are the only one * holding a reference to the new page at this point. */ - if (TestSetPageLocked(newpage)) + if (!trylock_page(newpage)) BUG(); /* Prepare mapping for the new page.*/ @@ -667,7 +667,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, BUG_ON(charge); rc = -EAGAIN; - if (TestSetPageLocked(page)) { + if (!trylock_page(page)) { if (!force) goto move_newpage; lock_page(page); diff --git a/mm/rmap.c b/mm/rmap.c index 94a5246a3f98..1ea4e6fcee77 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -422,7 +422,7 @@ int page_referenced(struct page *page, int is_locked, referenced += page_referenced_anon(page, mem_cont); else if (is_locked) referenced += page_referenced_file(page, mem_cont); - else if (TestSetPageLocked(page)) + else if (!trylock_page(page)) referenced++; else { if (page->mapping) diff --git a/mm/shmem.c b/mm/shmem.c index c1e5a3b4f758..04fb4f1ab88e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1265,7 +1265,7 @@ repeat: } /* We have to do this with page locked to prevent races */ - if (TestSetPageLocked(swappage)) { + if (!trylock_page(swappage)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); wait_on_page_locked(swappage); @@ -1329,7 +1329,7 @@ repeat: shmem_swp_unmap(entry); filepage = find_get_page(mapping, idx); if (filepage && - (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { + (!PageUptodate(filepage) || !trylock_page(filepage))) { spin_unlock(&info->lock); wait_on_page_locked(filepage); page_cache_release(filepage); diff --git a/mm/swap.c b/mm/swap.c index 7417a2adbe50..9e0cb3118079 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -444,7 +444,7 @@ void pagevec_strip(struct pagevec *pvec) for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (PagePrivate(page) && !TestSetPageLocked(page)) { + if (PagePrivate(page) && trylock_page(page)) { if (PagePrivate(page)) try_to_release_page(page, 0); unlock_page(page); diff --git a/mm/swap_state.c b/mm/swap_state.c index b8035b055129..167cf2dc8a03 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -201,7 +201,7 @@ void delete_from_swap_cache(struct page *page) */ static 
inline void free_swap_cache(struct page *page) { - if (PageSwapCache(page) && !TestSetPageLocked(page)) { + if (PageSwapCache(page) && trylock_page(page)) { remove_exclusive_swap_page(page); unlock_page(page); } @@ -302,9 +302,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * re-using the just freed swap entry for an existing page. * May fail (-ENOMEM) if radix-tree node allocation failed. */ - SetPageLocked(new_page); + set_page_locked(new_page); err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); - if (!err) { + if (likely(!err)) { /* * Initiate read into locked page and return. */ @@ -312,7 +312,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, swap_readpage(NULL, new_page); return new_page; } - ClearPageLocked(new_page); + clear_page_locked(new_page); swap_free(entry); } while (err != -ENOMEM); diff --git a/mm/swapfile.c b/mm/swapfile.c index bb7f79641f9e..1e330f2998fa 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -403,7 +403,7 @@ void free_swap_and_cache(swp_entry_t entry) if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) { page = find_get_page(&swapper_space, entry.val); - if (page && unlikely(TestSetPageLocked(page))) { + if (page && unlikely(!trylock_page(page))) { page_cache_release(page); page = NULL; } diff --git a/mm/truncate.c b/mm/truncate.c index 894e9a70699f..250505091d37 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -187,7 +187,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (page_index > next) next = page_index; next++; - if (TestSetPageLocked(page)) + if (!trylock_page(page)) continue; if (PageWriteback(page)) { unlock_page(page); @@ -280,7 +280,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping, pgoff_t index; int lock_failed; - lock_failed = TestSetPageLocked(page); + lock_failed = !trylock_page(page); /* * We really shouldn't be looking at the ->index of an diff --git a/mm/vmscan.c b/mm/vmscan.c index 75be453628bf..1ff1a58e7c10 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -496,7 +496,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, page = lru_to_page(page_list); list_del(&page->lru); - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto keep; VM_BUG_ON(PageActive(page)); @@ -582,7 +582,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. */ - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto keep; if (PageDirty(page) || PageWriteback(page)) goto keep_locked; -- cgit v1.2.2 From 5595cffc8248e4672c5803547445e85e4053c8fc Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 5 Aug 2008 09:28:47 +0300 Subject: SLUB: dynamic per-cache MIN_PARTIAL This patch changes the static MIN_PARTIAL to a dynamic per-cache ->min_partial value that is calculated from object size. The bigger the object size, the more pages we keep on the partial list. I tested SLAB, SLUB, and SLUB with this patch on Jens Axboe's 'netio' example script of the fio benchmarking tool. The script stresses the networking subsystem which should also give a fairly good beating of kmalloc() et al. 
To run the test yourself, first clone the fio repository: git clone git://git.kernel.dk/fio.git and then run the following command n times on your machine: time ./fio examples/netio The results on my 2-way 64-bit x86 machine are as follows: [ the minimum, maximum, and average are captured from 50 individual runs ] real time (seconds) min max avg sd SLAB 22.76 23.38 22.98 0.17 SLUB 22.80 25.78 23.46 0.72 SLUB (dynamic) 22.74 23.54 23.00 0.20 sys time (seconds) min max avg sd SLAB 6.90 8.28 7.70 0.28 SLUB 7.42 16.95 8.89 2.28 SLUB (dynamic) 7.17 8.64 7.73 0.29 user time (seconds) min max avg sd SLAB 36.89 38.11 37.50 0.29 SLUB 30.85 37.99 37.06 1.67 SLUB (dynamic) 36.75 38.07 37.59 0.32 As you can see from the above numbers, this patch brings SLUB to the same level as SLAB for this particular workload fixing a ~2% regression. I'd expect this change to help similar workloads that allocate a lot of objects that are close to the size of a page. Cc: Matthew Wilcox Cc: Andrew Morton Acked-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index c26d4c36fba9..4f5b96149458 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1329,7 +1329,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) n = get_node(s, zone_to_nid(zone)); if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > MIN_PARTIAL) { + n->nr_partial > n->min_partial) { page = get_partial_node(n); if (page) return page; @@ -1381,7 +1381,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) slab_unlock(page); } else { stat(c, DEACTIVATE_EMPTY); - if (n->nr_partial < MIN_PARTIAL) { + if (n->nr_partial < n->min_partial) { /* * Adding an empty slab to the partial slabs in order * to avoid page allocator overhead. This slab needs @@ -1913,9 +1913,21 @@ static void init_kmem_cache_cpu(struct kmem_cache *s, #endif } -static void init_kmem_cache_node(struct kmem_cache_node *n) +static void +init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { n->nr_partial = 0; + + /* + * The larger the object size is, the more pages we want on the partial + * list to avoid pounding the page allocator excessively. 
+ */ + n->min_partial = ilog2(s->size); + if (n->min_partial < MIN_PARTIAL) + n->min_partial = MIN_PARTIAL; + else if (n->min_partial > MAX_PARTIAL) + n->min_partial = MAX_PARTIAL; + spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG @@ -2087,7 +2099,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, init_object(kmalloc_caches, n, 1); init_tracking(kmalloc_caches, n); #endif - init_kmem_cache_node(n); + init_kmem_cache_node(n, kmalloc_caches); inc_slabs_node(kmalloc_caches, node, page->objects); /* @@ -2144,7 +2156,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } s->node[node] = n; - init_kmem_cache_node(n); + init_kmem_cache_node(n, s); } return 1; } @@ -2155,7 +2167,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) { - init_kmem_cache_node(&s->local_node); + init_kmem_cache_node(&s->local_node, s); return 1; } #endif @@ -2889,7 +2901,7 @@ static int slab_mem_going_online_callback(void *arg) ret = -ENOMEM; goto out; } - init_kmem_cache_node(n); + init_kmem_cache_node(n, s); s->node[nid] = n; } out: -- cgit v1.2.2 From 5c9ffc9c3d61dfcafd7cdb61c7b94f2d7ac408fb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 5 Aug 2008 13:01:01 -0700 Subject: mm_init.c: avoid ifdef-inside-macro-expansion gcc-3.2: mm/mm_init.c:77:1: directives may not be used inside a macro argument mm/mm_init.c:76:47: unterminated argument list invoking macro "mminit_dprintk" mm/mm_init.c: In function `mminit_verify_pageflags_layout': mm/mm_init.c:80: `mminit_dprintk' undeclared (first use in this function) mm/mm_init.c:80: (Each undeclared identifier is reported only once mm/mm_init.c:80: for each function it appears in.) mm/mm_init.c:80: syntax error before numeric constant Also fix a typo in a comment. Reported-by: Adrian Bunk Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mm_init.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mm_init.c b/mm/mm_init.c index c6af41ea9994..936ef2efd892 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -14,6 +14,10 @@ #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; +#ifndef SECTIONS_SHIFT +#define SECTIONS_SHIFT 0 +#endif + /* The zonelists are simply reported, validation is manual. */ void mminit_verify_zonelist(void) { @@ -74,11 +78,7 @@ void __init mminit_verify_pageflags_layout(void) NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", "Section %d Node %d Zone %d\n", -#ifdef SECTIONS_SHIFT SECTIONS_SHIFT, -#else - 0, -#endif NODES_SHIFT, ZONES_SHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", -- cgit v1.2.2 From dfe195fb79e88c334481f1362fef52f6d2e30b2d Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 5 Aug 2008 13:01:41 -0700 Subject: mm: fix uninitialized variables for find_vma_prepare callers gcc 4.3.0 correctly emits the following warnings. When a vma covering addr is found, find_vma_prepare indeed returns without setting pprev, rb_link, and rb_parent. 
mm/mmap.c: In function `insert_vm_struct': mm/mmap.c:2085: warning: `rb_parent' may be used uninitialized in this function mm/mmap.c:2085: warning: `rb_link' may be used uninitialized in this function mm/mmap.c:2084: warning: `prev' may be used uninitialized in this function mm/mmap.c: In function `copy_vma': mm/mmap.c:2124: warning: `rb_parent' may be used uninitialized in this function mm/mmap.c:2124: warning: `rb_link' may be used uninitialized in this function mm/mmap.c:2123: warning: `prev' may be used uninitialized in this function mm/mmap.c: In function `do_brk': mm/mmap.c:1951: warning: `rb_parent' may be used uninitialized in this function mm/mmap.c:1951: warning: `rb_link' may be used uninitialized in this function mm/mmap.c:1949: warning: `prev' may be used uninitialized in this function mm/mmap.c: In function `mmap_region': mm/mmap.c:1092: warning: `rb_parent' may be used uninitialized in this function mm/mmap.c:1092: warning: `rb_link' may be used uninitialized in this function mm/mmap.c:1089: warning: `prev' may be used uninitialized in this function Hugh adds: in fact, none of find_vma_prepare's callers use those values when a vma is found to be already covering addr, it's either an error or an occasion to munmap and repeat. Okay, let's quieten the compiler (but I would prefer it if pprev, rb_link and rb_parent were meaningful in that case, rather than whatever's in them from descending the tree). Signed-off-by: Benny Halevy Signed-off-by: Hugh Dickins Cc: "Ryan Hope" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 245c3d69067b..971d0eda754a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -370,7 +370,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, if (vma_tmp->vm_end > addr) { vma = vma_tmp; if (vma_tmp->vm_start <= addr) - return vma; + break; __rb_link = &__rb_parent->rb_left; } else { rb_prev = __rb_parent; -- cgit v1.2.2 From d6606683a5e3dac35cb979c7195f54ed827567bd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 6 Aug 2008 12:04:54 -0700 Subject: Revert duplicate "mm/hugetlb.c must #include " This reverts commit 7cb93181629c613ee2b8f4ffe3446f8003074842, since we did that patch twice, and the problem was already fixed earlier by 78a34ae29bf1c9df62a5bd0f0798b6c62a54d520. Reported-by: Andi Kleen Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 28a2980ee435..757ca983fd99 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -17,7 +17,7 @@ #include #include #include -#include + #include #include #include -- cgit v1.2.2 From 454ed842d55740160334efc9ad56cfef54ed37bc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Aug 2008 09:30:25 +0200 Subject: lockdep: annotate mm_take_all_locks() The nesting is correct due to holding mmap_sem, use the new annotation to annotate this. 
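The new annotation is spin_lock_nest_lock(); as a generic, hypothetical illustration of the idiom (not the mm/mmap.c code itself), many locks of a single lock class may be held at once provided the designated outer lock is held:

#include <linux/rwsem.h>
#include <linux/spinlock.h>

struct bucket {
	spinlock_t lock;
};

/* Hypothetical: 'outer' serializes all callers, so nesting the whole class is safe. */
static void lock_all_buckets(struct bucket *b, int n, struct rw_semaphore *outer)
{
	int i;

	down_write(outer);
	for (i = 0; i < n; i++)
		/* tell lockdep this nesting is covered by 'outer'; unlock path omitted */
		spin_lock_nest_lock(&b[i].lock, outer);
}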
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- mm/mmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 245c3d69067b..5d09d08a4120 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2273,14 +2273,14 @@ int install_special_mapping(struct mm_struct *mm, static DEFINE_MUTEX(mm_all_locks_mutex); -static void vm_lock_anon_vma(struct anon_vma *anon_vma) +static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - spin_lock(&anon_vma->lock); + spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); /* * We can safely modify head.next after taking the * anon_vma->lock. If some other vma in this mm shares @@ -2296,7 +2296,7 @@ static void vm_lock_anon_vma(struct anon_vma *anon_vma) } } -static void vm_lock_mapping(struct address_space *mapping) +static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) { if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* @@ -2310,7 +2310,7 @@ static void vm_lock_mapping(struct address_space *mapping) */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); - spin_lock(&mapping->i_mmap_lock); + spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); } } @@ -2359,9 +2359,9 @@ int mm_take_all_locks(struct mm_struct *mm) if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) - vm_lock_anon_vma(vma->anon_vma); + vm_lock_anon_vma(mm, vma->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) - vm_lock_mapping(vma->vm_file->f_mapping); + vm_lock_mapping(mm, vma->vm_file->f_mapping); } ret = 0; -- cgit v1.2.2 From 7cd5a02f54f4c9d16cf7fdffa2122bc73bb09b43 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Aug 2008 09:30:25 +0200 Subject: mm: fix mm_take_all_locks() locking order Lockdep spotted: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.27-rc1 #270 ------------------------------------------------------- qemu-kvm/2033 is trying to acquire lock: (&inode->i_data.i_mmap_lock){----}, at: [] mm_take_all_locks+0xc2/0xea but task is already holding lock: (&anon_vma->lock){----}, at: [] mm_take_all_locks+0x70/0xea which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (&anon_vma->lock){----}: [] __lock_acquire+0x11be/0x14d2 [] lock_acquire+0x5e/0x7a [] _spin_lock+0x3b/0x47 [] vma_adjust+0x200/0x444 [] split_vma+0x12f/0x146 [] mprotect_fixup+0x13c/0x536 [] sys_mprotect+0x1a9/0x21e [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff -> #0 (&inode->i_data.i_mmap_lock){----}: [] __lock_acquire+0xedb/0x14d2 [] lock_release_non_nested+0x1c2/0x219 [] lock_release+0x127/0x14a [] _spin_unlock+0x1e/0x50 [] mm_drop_all_locks+0x7f/0xb0 [] do_mmu_notifier_register+0xe2/0x112 [] mmu_notifier_register+0xe/0x10 [] kvm_dev_ioctl+0x11e/0x287 [kvm] [] vfs_ioctl+0x2a/0x78 [] do_vfs_ioctl+0x257/0x274 [] sys_ioctl+0x55/0x78 [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff other info that might help us debug this: 5 locks held by qemu-kvm/2033: #0: (&mm->mmap_sem){----}, at: [] do_mmu_notifier_register+0x55/0x112 #1: (mm_all_locks_mutex){--..}, at: [] mm_take_all_locks+0x34/0xea #2: (&anon_vma->lock){----}, at: [] mm_take_all_locks+0x70/0xea #3: (&anon_vma->lock){----}, at: [] mm_take_all_locks+0x70/0xea #4: (&anon_vma->lock){----}, at: [] mm_take_all_locks+0x70/0xea stack backtrace: Pid: 2033, comm: qemu-kvm Not tainted 2.6.27-rc1 #270 Call Trace: [] print_circular_bug_tail+0xb8/0xc3 [] __lock_acquire+0xedb/0x14d2 [] ? add_lock_to_list+0x7e/0xad [] ? mm_take_all_locks+0x70/0xea [] ? mm_take_all_locks+0x70/0xea [] lock_release_non_nested+0x1c2/0x219 [] ? mm_take_all_locks+0xc2/0xea [] ? mm_take_all_locks+0xc2/0xea [] ? trace_hardirqs_on_caller+0x4d/0x115 [] ? mm_drop_all_locks+0x7f/0xb0 [] lock_release+0x127/0x14a [] _spin_unlock+0x1e/0x50 [] mm_drop_all_locks+0x7f/0xb0 [] do_mmu_notifier_register+0xe2/0x112 [] mmu_notifier_register+0xe/0x10 [] kvm_dev_ioctl+0x11e/0x287 [kvm] [] ? file_has_perm+0x83/0x8e [] vfs_ioctl+0x2a/0x78 [] do_vfs_ioctl+0x257/0x274 [] sys_ioctl+0x55/0x78 [] system_call_fastpath+0x16/0x1b Which the locking hierarchy in mm/rmap.c confirms as valid. Fix this by first taking all the mapping->i_mmap_lock instances and then take all anon_vma->lock instances. Signed-off-by: Peter Zijlstra Acked-by: Hugh Dickins Signed-off-by: Ingo Molnar --- mm/mmap.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 5d09d08a4120..32a287b631d4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2358,11 +2358,17 @@ int mm_take_all_locks(struct mm_struct *mm) for (vma = mm->mmap; vma; vma = vma->vm_next) { if (signal_pending(current)) goto out_unlock; - if (vma->anon_vma) - vm_lock_anon_vma(mm, vma->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) vm_lock_mapping(mm, vma->vm_file->f_mapping); } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (signal_pending(current)) + goto out_unlock; + if (vma->anon_vma) + vm_lock_anon_vma(mm, vma->anon_vma); + } + ret = 0; out_unlock: -- cgit v1.2.2 From 912985dce45ef18fcdd9f5439fef054e0e22302a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 12 Aug 2008 17:52:52 -0500 Subject: mm: Make generic weak get_user_pages_fast and EXPORT_GPL it Out of line get_user_pages_fast fallback implementation, make it a weak symbol, get rid of CONFIG_HAVE_GET_USER_PAGES_FAST. Export the symbol to modules so lguest can use it. 
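A hypothetical module-side user of the now-exported symbol, using the signature added below (start, nr_pages, write, pages); the names are illustrative:

#include <linux/errno.h>
#include <linux/mm.h>

/* Hypothetical: pin a single user page for write access; returns 0 on success. */
static int pin_one_user_page(unsigned long uaddr, struct page **pagep)
{
	int got = get_user_pages_fast(uaddr & PAGE_MASK, 1, 1, pagep);

	return got == 1 ? 0 : -EFAULT;
}

The caller is expected to drop the pin later, e.g. with page_cache_release(*pagep).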
Signed-off-by: Nick Piggin Signed-off-by: Rusty Russell --- mm/Kconfig | 3 --- mm/util.c | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 446c6588c753..0bd9c2dbb2a0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -77,9 +77,6 @@ config FLAT_NODE_MEM_MAP def_bool y depends on !SPARSEMEM -config HAVE_GET_USER_PAGES_FAST - bool - # # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's # to represent different areas of memory. This variable allows diff --git a/mm/util.c b/mm/util.c index 9341ca77bd88..cb00b748ce47 100644 --- a/mm/util.c +++ b/mm/util.c @@ -171,3 +171,18 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->unmap_area = arch_unmap_area; } #endif + +int __attribute__((weak)) get_user_pages_fast(unsigned long start, + int nr_pages, int write, struct page **pages) +{ + struct mm_struct *mm = current->mm; + int ret; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, nr_pages, + write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(get_user_pages_fast); -- cgit v1.2.2 From caff3a2c333e11a794308bd9a875a09b94fee24a Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Tue, 12 Aug 2008 15:08:38 -0700 Subject: hugetlb: call arch_prepare_hugepage() for surplus pages The s390 software large page emulation implements shared page tables by using page->index of the first tail page from a compound large page to store page table information. This is set up in arch_prepare_hugepage(), which is called from alloc_fresh_huge_page_node(). A similar call to arch_prepare_hugepage() is missing for surplus large pages that are allocated in alloc_buddy_huge_page(), which breaks the software emulation mode for (surplus) large pages on s390. This patch adds the missing call to arch_prepare_hugepage(). It will have no effect on other architectures where arch_prepare_hugepage() is a nop. Also, use the correct order in the error path in alloc_fresh_huge_page_node(). Acked-by: Martin Schwidefsky Signed-off-by: Gerald Schaefer Acked-by: Nick Piggin Acked-by: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 757ca983fd99..92155db888b9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -565,7 +565,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) huge_page_order(h)); if (page) { if (arch_prepare_hugepage(page)) { - __free_pages(page, HUGETLB_PAGE_ORDER); + __free_pages(page, huge_page_order(h)); return NULL; } prep_new_huge_page(h, page, nid); @@ -665,6 +665,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); + if (page && arch_prepare_hugepage(page)) { + __free_pages(page, huge_page_order(h)); + return NULL; + } + spin_lock(&hugetlb_lock); if (page) { /* -- cgit v1.2.2 From 74768ed833344bb0f82b97cee46320a3d7f09ecd Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 12 Aug 2008 15:08:39 -0700 Subject: page allocator: use no-panic variant of alloc_bootmem() in alloc_large_system_hash() .. since a failed allocation is being (initially) handled gracefully, and panic()-ed upon failure explicitly in the function if retries with smaller sizes failed. 
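The convention being relied on, as a hedged sketch (illustrative names, not the alloc_large_system_hash() code itself): retry with progressively smaller sizes and only panic once every size has failed.

#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/kernel.h>

/* Illustrative only: shrink the request until boot memory can satisfy it. */
static void * __init table_alloc_or_panic(unsigned long size, unsigned long min_size)
{
	void *table = NULL;

	do {
		table = alloc_bootmem_nopanic(size);
		if (!table)
			size >>= 1;
	} while (!table && size >= min_size);

	if (!table)
		panic("could not allocate boot-time hash table\n");
	return table;
}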
Signed-off-by: Jan Beulich Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 401d104d2bb6..af982f7cdb2a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4437,7 +4437,7 @@ void *__init alloc_large_system_hash(const char *tablename, do { size = bucketsize << log2qty; if (flags & HASH_EARLY) - table = alloc_bootmem(size); + table = alloc_bootmem_nopanic(size); else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { -- cgit v1.2.2 From 9623e078c1f4692a91531af2f639ec8aff8f0472 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 12 Aug 2008 15:08:41 -0700 Subject: memcg: fix oops in mem_cgroup_shrink_usage Got an oops in mem_cgroup_shrink_usage() when testing loop over tmpfs: yes, of course, loop0 has no mm: other entry points check but this didn't. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7056c3bdb478..0f1f7a7374ba 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -796,6 +796,8 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) if (mem_cgroup_subsys.disabled) return 0; + if (!mm) + return 0; rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); -- cgit v1.2.2 From 57303d80175e10056bf51206f9961d586f02f967 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 12 Aug 2008 15:08:47 -0700 Subject: hugetlbfs: allocate structures for reservation tracking outside of spinlocks In the normal case, hugetlbfs reserves hugepages at map time so that the pages exist for future faults. A struct file_region is used to track when reservations have been consumed and where. These file_regions are allocated as necessary with kmalloc() which can sleep with the mm->page_table_lock held. This is wrong and triggers may-sleep warning when PREEMPT is enabled. Updates to the underlying file_region are done in two phases. The first phase prepares the region for the change, allocating any necessary memory, without actually making the change. The second phase actually commits the change. This patch makes use of this by checking the reservations before the page_table_lock is taken; triggering any necessary allocations. This may then be safely repeated within the locks without any allocations being required. Credit to Mel Gorman for diagnosing this failure and initial versions of the patch. Signed-off-by: Andy Whitcroft Tested-by: Gerald Schaefer Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 92155db888b9..4c97c174e2e1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1942,6 +1942,15 @@ retry: lock_page(page); } + /* + * If we are going to COW a private mapping later, we examine the + * pending reservations for this page now. This will ensure that + * any allocations necessary to record that reservation occur outside + * the spinlock. 
+ */ + if (write_access && !(vma->vm_flags & VM_SHARED)) + vma_needs_reservation(h, vma, address); + spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) @@ -1978,6 +1987,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); @@ -2000,19 +2010,35 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = 0; + /* + * If we are going to COW the mapping later, we examine the pending + * reservations for this page now. This will ensure that any + * allocations necessary to record that reservation occur outside the + * spinlock. For private mappings, we also lookup the pagecache + * page now as it is used to determine if a reservation has been + * consumed. + */ + if (write_access && !pte_write(entry)) { + vma_needs_reservation(h, vma, address); + + if (!(vma->vm_flags & VM_SHARED)) + pagecache_page = hugetlbfs_pagecache_page(h, + vma, address); + } + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (likely(pte_same(entry, huge_ptep_get(ptep)))) - if (write_access && !pte_write(entry)) { - struct page *page; - page = hugetlbfs_pagecache_page(h, vma, address); - ret = hugetlb_cow(mm, vma, address, ptep, entry, page); - if (page) { - unlock_page(page); - put_page(page); - } - } + if (write_access && !pte_write(entry)) + ret = hugetlb_cow(mm, vma, address, ptep, entry, + pagecache_page); spin_unlock(&mm->page_table_lock); + + if (pagecache_page) { + unlock_page(pagecache_page); + put_page(pagecache_page); + } + mutex_unlock(&hugetlb_instantiation_mutex); return ret; -- cgit v1.2.2 From 2b26736c88db85c038e04c2306d0745553e69602 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 12 Aug 2008 15:08:49 -0700 Subject: allocate structures for reservation tracking in hugetlbfs outside of spinlocks v2 [Andrew this should replace the previous version which did not check the returns from the region prepare for errors. This has been tested by us and Gerald and it looks good. Bah, while reviewing the locking based on your previous email I spotted that we need to check the return from the vma_needs_reservation call for allocation errors. Here is an updated patch to correct this. This passes testing here.] Signed-off-by: Andy Whitcroft Tested-by: Gerald Schaefer Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4c97c174e2e1..67a71191136e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1949,7 +1949,10 @@ retry: * the spinlock. 
*/ if (write_access && !(vma->vm_flags & VM_SHARED)) - vma_needs_reservation(h, vma, address); + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> huge_page_shift(h); @@ -1976,6 +1979,7 @@ out: backout: spin_unlock(&mm->page_table_lock); +backout_unlocked: unlock_page(page); put_page(page); goto out; @@ -2004,8 +2008,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { ret = hugetlb_no_page(mm, vma, address, ptep, write_access); - mutex_unlock(&hugetlb_instantiation_mutex); - return ret; + goto out_unlock; } ret = 0; @@ -2019,7 +2022,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * consumed. */ if (write_access && !pte_write(entry)) { - vma_needs_reservation(h, vma, address); + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto out_unlock; + } if (!(vma->vm_flags & VM_SHARED)) pagecache_page = hugetlbfs_pagecache_page(h, @@ -2039,6 +2045,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(pagecache_page); } +out_unlock: mutex_unlock(&hugetlb_instantiation_mutex); return ret; -- cgit v1.2.2 From d6bf73e4340f52159c1d9f13836b62e20fcd12d3 Mon Sep 17 00:00:00 2001 From: MinChan Kim Date: Tue, 12 Aug 2008 15:08:52 -0700 Subject: do_migrate_pages(): remove unused variable Signed-off-by: MinChan Kim Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e550bec20582..83369058ec13 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -803,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) { - LIST_HEAD(pagelist); int busy = 0; int err = 0; nodemask_t tmp; -- cgit v1.2.2 From fc1efbdb7a1175759b099d74b67921396e5e8e3d Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Tue, 12 Aug 2008 15:09:00 -0700 Subject: mm/sparse.c: removed duplicated include Signed-off-by: Huang Weiyi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 5d9dbbb9d39e..39db301b920d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -12,7 +12,6 @@ #include #include #include -#include "internal.h" /* * Permanent SPARSEMEM data: -- cgit v1.2.2