-rw-r--r--  Documentation/vm/hwpoison.txt | 136
-rw-r--r--  include/linux/prctl.h         |  12
-rw-r--r--  kernel/sys.c                  |  23
-rw-r--r--  kernel/workqueue.c            |  21
-rw-r--r--  mm/memory-failure.c           |  56
-rw-r--r--  mm/memory.c                   |   3
6 files changed, 216 insertions(+), 35 deletions(-)
diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt
new file mode 100644
index 000000000000..3ffadf8da61f
--- /dev/null
+++ b/Documentation/vm/hwpoison.txt
@@ -0,0 +1,136 @@
What is hwpoison?

Upcoming Intel CPUs have support for recovering from some memory errors
(``MCA recovery''). This requires the OS to declare a page "poisoned",
kill the processes associated with it and avoid using it in the future.

This patchkit implements the necessary infrastructure in the VM.

To quote the overview comment:

 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted usually due to a 2bit ECC memory or cache
 * failure.
 *
 * This focusses on pages detected as corrupted in the background.
 * When the current CPU tries to consume corruption the currently
 * running process can just be killed directly instead. This implies
 * that if the error cannot be handled for some reason it's safe to
 * just ignore it because no corruption has been consumed yet. Instead
 * when that happens another machine check will happen.
 *
 * Handles page cache pages in various states. The tricky part
 * here is that we can access any page asynchronous to other VM
 * users, because memory failures could happen anytime and anywhere,
 * possibly violating some of their assumptions. This is why this code
 * has to be extremely careful. Generally it tries to use normal locking
 * rules, as in get the standard locks, even if that means the
 * error handling takes potentially a long time.
 *
 * Some of the operations here are somewhat inefficient and have non
 * linear algorithmic complexity, because the data structures have not
 * been optimized for this case. This is in particular the case
 * for the mapping from a vma to a process. Since this case is expected
 * to be rare we hope we can get away with this.

The code consists of the high level handler in mm/memory-failure.c,
a new page poison bit and various checks in the VM to handle poisoned
pages.

The main target right now is KVM guests, but it works for all kinds
of applications. KVM support requires a recent qemu-kvm release.

For the KVM use case a new signal type was needed so that
KVM can inject the machine check into the guest with the proper
address. In theory this also allows other applications to handle
memory failures. The expectation is that nearly all applications
won't do that, but some very specialized ones might.

---

There are two (actually three) modes memory failure recovery can be in:

vm.memory_failure_recovery sysctl set to zero:
        All memory failures cause a panic. Do not attempt recovery.
        (on x86 this can also be affected by the tolerant level of the
        MCE subsystem)

early kill
        (can be controlled globally and per process)
        Send SIGBUS to the application as soon as the error is detected.
        This allows applications that can process memory errors in a
        gentle way (e.g. drop the affected object); see the signal
        handler sketch below.
        This is the mode used by KVM qemu.

late kill
        Send SIGBUS when the application runs into the corrupted page.
        This is best for memory error unaware applications and is the
        default.
        Note that some pages are always handled as late kill.

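As an illustration (not part of the patchkit), a minimal sketch of the
SIGBUS handler an early kill aware application could install. It assumes
the BUS_MCEERR_AO/BUS_MCEERR_AR si_code values used by the new SIGBUS
reporting and the standard sigaction(SA_SIGINFO) interface:

    #define _GNU_SOURCE
    #include <signal.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    /* BUS_MCEERR_AO: corruption was found in the background, nothing has
     * been consumed yet, so the object around si_addr can simply be
     * dropped or rebuilt and the process keeps running. */
    static void mce_sigbus_handler(int sig, siginfo_t *si, void *ctx)
    {
            if (si->si_code == BUS_MCEERR_AO) {
                    static const char msg[] = "hwpoison: dropping object\n";

                    /* A real handler must only do async-signal-safe work
                     * with the page that contains si->si_addr. */
                    write(STDERR_FILENO, msg, sizeof(msg) - 1);
                    return;
            }
            /* BUS_MCEERR_AR (or anything else): corruption was consumed. */
            _exit(EXIT_FAILURE);
    }

    static void install_mce_handler(void)
    {
            struct sigaction sa;

            memset(&sa, 0, sizeof(sa));
            sa.sa_sigaction = mce_sigbus_handler;
            sa.sa_flags = SA_SIGINFO;
            sigemptyset(&sa.sa_mask);
            sigaction(SIGBUS, &sa, NULL);
    }
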
---

User control:

vm.memory_failure_recovery
        See sysctl.txt

vm.memory_failure_early_kill
        Enable early kill mode globally

PR_MCE_KILL
        Set the early/late kill mode, or revert to the system default.
        arg1: PR_MCE_KILL_CLEAR: Revert to system default
        arg1: PR_MCE_KILL_SET: arg2 defines thread specific mode
                PR_MCE_KILL_EARLY: Early kill
                PR_MCE_KILL_LATE: Late kill
                PR_MCE_KILL_DEFAULT: Use system global default

PR_MCE_KILL_GET
        Return the current kill mode
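
For illustration, a minimal userspace sketch of this prctl interface
(error handling trimmed). It assumes the PR_MCE_KILL* constants are
visible via <sys/prctl.h>; on older C libraries they may have to be
pulled in from linux/prctl.h by hand:

    #include <stdio.h>
    #include <sys/prctl.h>

    int main(void)
    {
            /* Ask for early kill: SIGBUS as soon as a corruption hitting
             * one of our pages is detected, not only when it is touched. */
            if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0) < 0)
                    perror("PR_MCE_KILL");

            /* Read back the current per-thread mode. */
            printf("kill mode: %d\n", prctl(PR_MCE_KILL_GET, 0, 0, 0, 0));

            /* Revert to the system default (vm.memory_failure_early_kill). */
            prctl(PR_MCE_KILL, PR_MCE_KILL_CLEAR, 0, 0, 0);
            return 0;
    }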


---

Testing:

madvise(MADV_POISON, ....)
        (as root)
        Poison a page in the process for testing
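
For example, a small self-contained test along these lines, run as root.
It assumes the flag is exported as MADV_HWPOISON with value 100 (the name
and value used by mainline headers); adjust the name if your tree calls
it MADV_POISON:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #ifndef MADV_HWPOISON
    #define MADV_HWPOISON 100  /* assumed value, see asm-generic/mman-common.h */
    #endif

    int main(void)
    {
            long pagesize = sysconf(_SC_PAGESIZE);
            char *p;

            /* Back the mapping with a real page before poisoning it. */
            p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED)
                    return EXIT_FAILURE;
            memset(p, 0xab, pagesize);

            /* Needs privilege; with late kill the SIGBUS only arrives
             * once the poisoned page is touched again. */
            if (madvise(p, pagesize, MADV_HWPOISON) < 0)
                    perror("madvise");

            return 0;
    }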


hwpoison-inject module through debugfs
        /sys/debug/hwpoison/corrupt-pfn

Inject a hwpoison fault at the PFN echoed into this file
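
For example, the echo can be done from a test program like this (the
path follows the text above; on many systems debugfs is mounted under
/sys/kernel/debug instead, so treat the location as an assumption):

    #include <stdio.h>

    /* Write a page frame number into the hwpoison-inject debugfs file. */
    static int inject_hwpoison(unsigned long pfn)
    {
            FILE *f = fopen("/sys/debug/hwpoison/corrupt-pfn", "w");

            if (!f)
                    return -1;
            fprintf(f, "%lu\n", pfn);
            return fclose(f);
    }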

Architecture specific MCE injector

x86 has mce-inject, mce-test

Some portable hwpoison test programs in mce-test, see below.

---

References:

http://halobates.de/mce-lc09-2.pdf
        Overview presentation from LinuxCon 09

git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
        Test suite (hwpoison specific portable tests in tsrc)

git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
        x86 specific injector

---

Limitations:

- Not all page types are supported, and never will be. Most kernel internal
  objects cannot be recovered; only LRU pages are handled for now.
- Right now hugepage support is missing.

---
Andi Kleen, Oct 2009

diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 931150566ade..a3baeb2c2161 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -88,6 +88,18 @@
 #define PR_TASK_PERF_EVENTS_DISABLE 31
 #define PR_TASK_PERF_EVENTS_ENABLE 32
 
+/*
+ * Set early/late kill mode for hwpoison memory corruption.
+ * This influences when the process gets killed on a memory corruption.
+ */
 #define PR_MCE_KILL 33
+# define PR_MCE_KILL_CLEAR   0
+# define PR_MCE_KILL_SET     1
+
+# define PR_MCE_KILL_LATE    0
+# define PR_MCE_KILL_EARLY   1
+# define PR_MCE_KILL_DEFAULT 2
+
+#define PR_MCE_KILL_GET 34
 
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 1828f8d10844..ce17760d9c51 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1548,24 +1548,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                 if (arg4 | arg5)
                         return -EINVAL;
                 switch (arg2) {
-                case 0:
+                case PR_MCE_KILL_CLEAR:
                         if (arg3 != 0)
                                 return -EINVAL;
                         current->flags &= ~PF_MCE_PROCESS;
                         break;
-                case 1:
+                case PR_MCE_KILL_SET:
                         current->flags |= PF_MCE_PROCESS;
-                        if (arg3 != 0)
+                        if (arg3 == PR_MCE_KILL_EARLY)
                                 current->flags |= PF_MCE_EARLY;
-                        else
+                        else if (arg3 == PR_MCE_KILL_LATE)
                                 current->flags &= ~PF_MCE_EARLY;
+                        else if (arg3 == PR_MCE_KILL_DEFAULT)
+                                current->flags &=
+                                        ~(PF_MCE_EARLY|PF_MCE_PROCESS);
+                        else
+                                return -EINVAL;
                         break;
                 default:
                         return -EINVAL;
                 }
                 error = 0;
                 break;
-
+        case PR_MCE_KILL_GET:
+                if (arg2 | arg3 | arg4 | arg5)
+                        return -EINVAL;
+                if (current->flags & PF_MCE_PROCESS)
+                        error = (current->flags & PF_MCE_EARLY) ?
+                                PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+                else
+                        error = PR_MCE_KILL_DEFAULT;
+                break;
         default:
                 error = -EINVAL;
                 break;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 47cdd7e76f2b..12328147132c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -685,21 +685,38 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
 int schedule_on_each_cpu(work_func_t func)
 {
         int cpu;
+        int orig = -1;
         struct work_struct *works;
 
         works = alloc_percpu(struct work_struct);
         if (!works)
                 return -ENOMEM;
 
+        /*
+         * when running in keventd don't schedule a work item on itself.
+         * Can just call directly because the work queue is already bound.
+         * This also is faster.
+         * Make this a generic parameter for other workqueues?
+         */
+        if (current_is_keventd()) {
+                orig = raw_smp_processor_id();
+                INIT_WORK(per_cpu_ptr(works, orig), func);
+                func(per_cpu_ptr(works, orig));
+        }
+
         get_online_cpus();
         for_each_online_cpu(cpu) {
                 struct work_struct *work = per_cpu_ptr(works, cpu);
 
+                if (cpu == orig)
+                        continue;
                 INIT_WORK(work, func);
                 schedule_work_on(cpu, work);
         }
-        for_each_online_cpu(cpu)
-                flush_work(per_cpu_ptr(works, cpu));
+        for_each_online_cpu(cpu) {
+                if (cpu != orig)
+                        flush_work(per_cpu_ptr(works, cpu));
+        }
         put_online_cpus();
         free_percpu(works);
         return 0;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7fc2130d2737..dacc64183874 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -371,9 +371,6 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
         int ret = FAILED;
         struct address_space *mapping;
 
-        if (!isolate_lru_page(p))
-                page_cache_release(p);
-
         /*
          * For anonymous pages we're done the only reference left
          * should be the one m_f() holds.
@@ -499,30 +496,18 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
  */
 static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 {
-        int ret = FAILED;
-
         ClearPageDirty(p);
         /* Trigger EIO in shmem: */
         ClearPageUptodate(p);
 
-        if (!isolate_lru_page(p)) {
-                page_cache_release(p);
-                ret = DELAYED;
-        }
-
-        return ret;
+        return DELAYED;
 }
 
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
 {
-        int ret = FAILED;
-
-        if (!isolate_lru_page(p)) {
-                page_cache_release(p);
-                ret = RECOVERED;
-        }
         delete_from_swap_cache(p);
-        return ret;
+
+        return RECOVERED;
 }
 
 /*
@@ -612,8 +597,6 @@ static struct page_state {
         { 0, 0, "unknown page state", me_unknown },
 };
 
-#undef lru
-
 static void action_result(unsigned long pfn, char *msg, int result)
 {
         struct page *page = NULL;
@@ -630,13 +613,16 @@ static int page_action(struct page_state *ps, struct page *p,
                         unsigned long pfn, int ref)
 {
         int result;
+        int count;
 
         result = ps->action(p, pfn);
         action_result(pfn, ps->msg, result);
-        if (page_count(p) != 1 + ref)
+
+        count = page_count(p) - 1 - ref;
+        if (count != 0)
                 printk(KERN_ERR
                        "MCE %#lx: %s page still referenced by %d users\n",
-                       pfn, ps->msg, page_count(p) - 1);
+                       pfn, ps->msg, count);
 
         /* Could do more checks here if page looks ok */
         /*
@@ -665,9 +651,6 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
         if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
                 return;
 
-        if (!PageLRU(p))
-                lru_add_drain_all();
-
         /*
          * This check implies we don't kill processes if their pages
          * are in the swap cache early. Those are always late kills.
@@ -739,6 +722,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
 
 int __memory_failure(unsigned long pfn, int trapno, int ref)
 {
+        unsigned long lru_flag;
         struct page_state *ps;
         struct page *p;
         int res;
@@ -776,6 +760,24 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
         }
 
         /*
+         * We ignore non-LRU pages for good reasons.
+         * - PG_locked is only well defined for LRU pages and a few others
+         * - to avoid races with __set_page_locked()
+         * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
+         * The check (unnecessarily) ignores LRU pages being isolated and
+         * walked by the page reclaim code, however that's not a big loss.
+         */
+        if (!PageLRU(p))
+                lru_add_drain_all();
+        lru_flag = p->flags & lru;
+        if (isolate_lru_page(p)) {
+                action_result(pfn, "non LRU", IGNORED);
+                put_page(p);
+                return -EBUSY;
+        }
+        page_cache_release(p);
+
+        /*
          * Lock the page and wait for writeback to finish.
          * It's very difficult to mess with pages currently under IO
          * and in many cases impossible, so we just avoid it here.
@@ -791,7 +793,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
         /*
          * Torn down by someone else?
          */
-        if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+        if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
                 action_result(pfn, "already truncated LRU", IGNORED);
                 res = 0;
                 goto out;
@@ -799,7 +801,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 
         res = -EBUSY;
         for (ps = error_states;; ps++) {
-                if ((p->flags & ps->mask) == ps->res) {
+                if (((p->flags | lru_flag) & ps->mask) == ps->res) {
                         res = page_action(ps, p, pfn, ref);
                         break;
                 }
diff --git a/mm/memory.c b/mm/memory.c
index 60ea601e03ea..6ab19dd4a199 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2542,7 +2542,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
         } else if (PageHWPoison(page)) {
                 ret = VM_FAULT_HWPOISON;
                 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
-                goto out;
+                goto out_release;
         }
 
         lock_page(page);
@@ -2614,6 +2614,7 @@ out_nomap:
         pte_unmap_unlock(page_table, ptl);
 out_page:
         unlock_page(page);
+out_release:
         page_cache_release(page);
         return ret;
 }