author    Dave Jones <davej@redhat.com>    2006-09-05 17:20:21 -0400
committer Dave Jones <davej@redhat.com>    2006-09-05 17:20:21 -0400
commit    115b384cf87249d76adb0b21aca11ee22128927d (patch)
tree      f39a2a54863e9d82d1196906f92c82ab5991c6af /mm
parent    8eb7925f93af75e66a240d148efdec212f95bcb7 (diff)
parent    c336923b668fdcf0312efbec3b44895d713f4d81 (diff)
Merge ../linus
Diffstat (limited to 'mm')
-rw-r--r--  mm/fadvise.c           3
-rw-r--r--  mm/filemap.c           2
-rw-r--r--  mm/memory_hotplug.c   44
-rw-r--r--  mm/mempolicy.c        10
-rw-r--r--  mm/mempool.c           9
-rw-r--r--  mm/slab.c              4
-rw-r--r--  mm/swap.c             20
-rw-r--r--  mm/swapfile.c          3
-rw-r--r--  mm/vmstat.c          151
9 files changed, 196 insertions, 50 deletions
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 60a5d55e51d9..168c78a121bb 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -73,7 +73,6 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 		file->f_ra.ra_pages = bdi->ra_pages * 2;
 		break;
 	case POSIX_FADV_WILLNEED:
-	case POSIX_FADV_NOREUSE:
 		if (!mapping->a_ops->readpage) {
 			ret = -EINVAL;
 			break;
@@ -94,6 +93,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 		if (ret > 0)
 			ret = 0;
 		break;
+	case POSIX_FADV_NOREUSE:
+		break;
 	case POSIX_FADV_DONTNEED:
 		if (!bdi_write_congested(mapping->backing_dev_info))
 			filemap_flush(mapping);
diff --git a/mm/filemap.c b/mm/filemap.c
index d087fc3d3281..b9a60c43b61a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -849,8 +849,6 @@ static void shrink_readahead_size_eio(struct file *filp,
 		return;
 
 	ra->ra_pages /= 4;
-	printk(KERN_WARNING "Reducing readahead size to %luK\n",
-			ra->ra_pages << (PAGE_CACHE_SHIFT - 10));
 }
 
 /**
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 01c9fb97c619..c37319542b70 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -52,6 +52,9 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
 	int nr_pages = PAGES_PER_SECTION;
 	int ret;
 
+	if (pfn_valid(phys_start_pfn))
+		return -EEXIST;
+
 	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
 
 	if (ret < 0)
@@ -76,15 +79,22 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
 {
 	unsigned long i;
 	int err = 0;
+	int start_sec, end_sec;
+	/* during initialize mem_map, align hot-added range to section */
+	start_sec = pfn_to_section_nr(phys_start_pfn);
+	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
 
-	for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) {
-		err = __add_section(zone, phys_start_pfn + i);
+	for (i = start_sec; i <= end_sec; i++) {
+		err = __add_section(zone, i << PFN_SECTION_SHIFT);
 
-		/* We want to keep adding the rest of the
-		 * sections if the first ones already exist
+		/*
+		 * EEXIST is finally dealed with by ioresource collision
+		 * check. see add_memory() => register_memory_resource()
+		 * Warning will be printed if there is collision.
 		 */
 		if (err && (err != -EEXIST))
 			break;
+		err = 0;
 	}
 
 	return err;
@@ -156,7 +166,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	res.flags = IORESOURCE_MEM; /* we just need system ram */
 	section_end = res.end;
 
-	while (find_next_system_ram(&res) >= 0) {
+	while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
 		start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
 		nr_pages = (unsigned long)
 			((res.end + 1 - res.start) >> PAGE_SHIFT);
@@ -213,10 +223,9 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 }
 
 /* add this memory to iomem resource */
-static void register_memory_resource(u64 start, u64 size)
+static struct resource *register_memory_resource(u64 start, u64 size)
 {
 	struct resource *res;
-
 	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
 	BUG_ON(!res);
 
@@ -228,7 +237,18 @@ static void register_memory_resource(u64 start, u64 size)
 		printk("System RAM resource %llx - %llx cannot be added\n",
 		(unsigned long long)res->start, (unsigned long long)res->end);
 		kfree(res);
+		res = NULL;
 	}
+	return res;
+}
+
+static void release_memory_resource(struct resource *res)
+{
+	if (!res)
+		return;
+	release_resource(res);
+	kfree(res);
+	return;
 }
 
 
@@ -237,8 +257,13 @@ int add_memory(int nid, u64 start, u64 size)
 {
 	pg_data_t *pgdat = NULL;
 	int new_pgdat = 0;
+	struct resource *res;
 	int ret;
 
+	res = register_memory_resource(start, size);
+	if (!res)
+		return -EEXIST;
+
 	if (!node_online(nid)) {
 		pgdat = hotadd_new_pgdat(nid, start);
 		if (!pgdat)
@@ -268,14 +293,13 @@ int add_memory(int nid, u64 start, u64 size)
 		BUG_ON(ret);
 	}
 
-	/* register this memory as resource */
-	register_memory_resource(start, size);
-
 	return ret;
 error:
 	/* rollback pgdat allocation and others */
 	if (new_pgdat)
 		rollback_node_hotadd(nid, pgdat);
+	if (res)
+		release_memory_resource(res);
 
 	return ret;
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e07e27e846a2..a9963ceddd65 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1176,7 +1176,15 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 	if (vma) {
 		unsigned long off;
 
-		off = vma->vm_pgoff;
+		/*
+		 * for small pages, there is no difference between
+		 * shift and PAGE_SHIFT, so the bit-shift is safe.
+		 * for huge pages, since vm_pgoff is in units of small
+		 * pages, we need to shift off the always 0 bits to get
+		 * a useful offset.
+		 */
+		BUG_ON(shift < PAGE_SHIFT);
+		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
 		off += (addr - vma->vm_start) >> shift;
 		return offset_il_node(pol, vma, off);
 	} else
diff --git a/mm/mempool.c b/mm/mempool.c
index fe6e05289cc5..ccd8cb8cd41f 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -238,8 +238,13 @@ repeat_alloc:
 	init_wait(&wait);
 	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
 	smp_mb();
-	if (!pool->curr_nr)
-		io_schedule();
+	if (!pool->curr_nr) {
+		/*
+		 * FIXME: this should be io_schedule(). The timeout is there
+		 * as a workaround for some DM problems in 2.6.18.
+		 */
+		io_schedule_timeout(5*HZ);
+	}
 	finish_wait(&pool->wait, &wait);
 
 	goto repeat_alloc;
diff --git a/mm/slab.c b/mm/slab.c
index 0f20843beffd..21ba06035700 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1106,7 +1106,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 
 #endif
 
-static int __devinit cpuup_callback(struct notifier_block *nfb,
+static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 				    unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
@@ -3224,7 +3224,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 EXPORT_SYMBOL(kmem_cache_alloc);
 
 /**
- * kmem_cache_alloc - Allocate an object. The memory is set to zero.
+ * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
  * @cache: The cache to allocate from.
  * @flags: See kmalloc().
  *
diff --git a/mm/swap.c b/mm/swap.c
index 8fd095c4ae51..687686a61f7c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -54,6 +54,26 @@ void put_page(struct page *page)
 }
 EXPORT_SYMBOL(put_page);
 
+/**
+ * put_pages_list(): release a list of pages
+ *
+ * Release a list of pages which are strung together on page.lru. Currently
+ * used by read_cache_pages() and related error recovery code.
+ *
+ * @pages: list of pages threaded on page->lru
+ */
+void put_pages_list(struct list_head *pages)
+{
+	while (!list_empty(pages)) {
+		struct page *victim;
+
+		victim = list_entry(pages->prev, struct page, lru);
+		list_del(&victim->lru);
+		page_cache_release(victim);
+	}
+}
+EXPORT_SYMBOL(put_pages_list);
+
 /*
  * Writeback is about to end against a page which has been marked for immediate
  * reclaim. If it still appears to be reclaimable, move it to the tail of the
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e70d6c6d6fee..f1f5ec783781 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -442,11 +442,12 @@ int swap_type_of(dev_t device)
 
 		if (!(swap_info[i].flags & SWP_WRITEOK))
 			continue;
+
 		if (!device) {
 			spin_unlock(&swap_lock);
 			return i;
 		}
-		inode = swap_info->swap_file->f_dentry->d_inode;
+		inode = swap_info[i].swap_file->f_dentry->d_inode;
 		if (S_ISBLK(inode->i_mode) &&
 		    device == MKDEV(imajor(inode), iminor(inode))) {
 			spin_unlock(&swap_lock);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index dfdf24133901..c1b5f4106b38 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,6 +12,7 @@
 #include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/cpu.h>
 
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
 			unsigned long *free, struct pglist_data *pgdat)
@@ -114,17 +115,72 @@ EXPORT_SYMBOL(vm_stat);
 
 #ifdef CONFIG_SMP
 
-#define STAT_THRESHOLD 32
+static int calculate_threshold(struct zone *zone)
+{
+	int threshold;
+	int mem;	/* memory in 128 MB units */
+
+	/*
+	 * The threshold scales with the number of processors and the amount
+	 * of memory per zone. More memory means that we can defer updates for
+	 * longer, more processors could lead to more contention.
+	 * fls() is used to have a cheap way of logarithmic scaling.
+	 *
+	 * Some sample thresholds:
+	 *
+	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
+	 * ------------------------------------------------------------------
+	 * 8		1		1	0.9-1 GB	4
+	 * 16		2		2	0.9-1 GB	4
+	 * 20		2		2	1-2 GB		5
+	 * 24		2		2	2-4 GB		6
+	 * 28		2		2	4-8 GB		7
+	 * 32		2		2	8-16 GB		8
+	 * 4		2		2	<128M		1
+	 * 30		4		3	2-4 GB		5
+	 * 48		4		3	8-16 GB		8
+	 * 32		8		4	1-2 GB		4
+	 * 32		8		4	0.9-1GB		4
+	 * 10		16		5	<128M		1
+	 * 40		16		5	900M		4
+	 * 70		64		7	2-4 GB		5
+	 * 84		64		7	4-8 GB		6
+	 * 108		512		9	4-8 GB		6
+	 * 125		1024		10	8-16 GB		8
+	 * 125		1024		10	16-32 GB	9
+	 */
+
+	mem = zone->present_pages >> (27 - PAGE_SHIFT);
+
+	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
+
+	/*
+	 * Maximum threshold is 125
+	 */
+	threshold = min(125, threshold);
+
+	return threshold;
+}
 
 /*
- * Determine pointer to currently valid differential byte given a zone and
- * the item number.
- *
- * Preemption must be off
+ * Refresh the thresholds for each zone.
  */
-static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
+static void refresh_zone_stat_thresholds(void)
 {
-	return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item];
+	struct zone *zone;
+	int cpu;
+	int threshold;
+
+	for_each_zone(zone) {
+
+		if (!zone->present_pages)
+			continue;
+
+		threshold = calculate_threshold(zone);
+
+		for_each_online_cpu(cpu)
+			zone_pcp(zone, cpu)->stat_threshold = threshold;
+	}
 }
 
 /*
@@ -133,17 +189,16 @@ static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 				int delta)
 {
-	s8 *p;
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = pcp->vm_stat_diff + item;
 	long x;
 
-	p = diff_pointer(zone, item);
 	x = delta + *p;
 
-	if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) {
+	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
 		zone_page_state_add(x, zone, item);
 		x = 0;
 	}
-
 	*p = x;
 }
 EXPORT_SYMBOL(__mod_zone_page_state);
@@ -172,10 +227,12 @@ EXPORT_SYMBOL(mod_zone_page_state);
  * No overflow check is necessary and therefore the differential can be
  * incremented or decremented in place which may allow the compilers to
  * generate better code.
- *
  * The increment or decrement is known and therefore one boundary check can
  * be omitted.
  *
+ * NOTE: These functions are very performance sensitive. Change only
+ * with care.
+ *
  * Some processors have inc/dec instructions that are atomic vs an interrupt.
  * However, the code must first determine the differential location in a zone
  * based on the processor number and then inc/dec the counter. There is no
@@ -185,13 +242,16 @@ EXPORT_SYMBOL(mod_zone_page_state);
  */
 static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	s8 *p = diff_pointer(zone, item);
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = pcp->vm_stat_diff + item;
 
 	(*p)++;
 
-	if (unlikely(*p > STAT_THRESHOLD)) {
-		zone_page_state_add(*p, zone, item);
-		*p = 0;
+	if (unlikely(*p > pcp->stat_threshold)) {
+		int overstep = pcp->stat_threshold / 2;
+
+		zone_page_state_add(*p + overstep, zone, item);
+		*p = -overstep;
 	}
 }
 
@@ -204,13 +264,16 @@ EXPORT_SYMBOL(__inc_zone_page_state);
 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
 	struct zone *zone = page_zone(page);
-	s8 *p = diff_pointer(zone, item);
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = pcp->vm_stat_diff + item;
 
 	(*p)--;
 
-	if (unlikely(*p < -STAT_THRESHOLD)) {
-		zone_page_state_add(*p, zone, item);
-		*p = 0;
+	if (unlikely(*p < - pcp->stat_threshold)) {
+		int overstep = pcp->stat_threshold / 2;
+
+		zone_page_state_add(*p - overstep, zone, item);
+		*p = overstep;
 	}
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
@@ -239,19 +302,9 @@ EXPORT_SYMBOL(inc_zone_page_state);
 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
 	unsigned long flags;
-	struct zone *zone;
-	s8 *p;
 
-	zone = page_zone(page);
 	local_irq_save(flags);
-	p = diff_pointer(zone, item);
-
-	(*p)--;
-
-	if (unlikely(*p < -STAT_THRESHOLD)) {
-		zone_page_state_add(*p, zone, item);
-		*p = 0;
-	}
+	__dec_zone_page_state(page, item);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
@@ -525,6 +578,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 			   pageset->pcp[j].high,
 			   pageset->pcp[j].batch);
 		}
+#ifdef CONFIG_SMP
+		seq_printf(m, "\n vm stats threshold: %d",
+				pageset->stat_threshold);
+#endif
 	}
 	seq_printf(m,
 		   "\n all_unreclaimable: %u"
@@ -613,3 +670,35 @@ struct seq_operations vmstat_op = {
 
 #endif /* CONFIG_PROC_FS */
 
+#ifdef CONFIG_SMP
+/*
+ * Use the cpu notifier to insure that the thresholds are recalculated
+ * when necessary.
+ */
+static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		refresh_zone_stat_thresholds();
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata vmstat_notifier =
+	{ &vmstat_cpuup_callback, NULL, 0 };
+
+int __init setup_vmstat(void)
+{
+	refresh_zone_stat_thresholds();
+	register_cpu_notifier(&vmstat_notifier);
+	return 0;
+}
+module_init(setup_vmstat)
+#endif