From f58ee00f1547ceb17b610ecfce2aa9097f1f9737 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 4 Oct 2009 02:28:42 +0200 Subject: HWPOISON: Add brief hwpoison description to Documentation Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 136 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 Documentation/vm/hwpoison.txt (limited to 'Documentation/vm/hwpoison.txt') diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt new file mode 100644 index 000000000000..3ffadf8da61f --- /dev/null +++ b/Documentation/vm/hwpoison.txt @@ -0,0 +1,136 @@ +What is hwpoison? + +Upcoming Intel CPUs have support for recovering from some memory errors +(``MCA recovery''). This requires the OS to declare a page "poisoned", +kill the processes associated with it and avoid using it in the future. + +This patchkit implements the necessary infrastructure in the VM. + +To quote the overview comment: + + * High level machine check handler. Handles pages reported by the + * hardware as being corrupted usually due to a 2bit ECC memory or cache + * failure. + * + * This focusses on pages detected as corrupted in the background. + * When the current CPU tries to consume corruption the currently + * running process can just be killed directly instead. This implies + * that if the error cannot be handled for some reason it's safe to + * just ignore it because no corruption has been consumed yet. Instead + * when that happens another machine check will happen. + * + * Handles page cache pages in various states. The tricky part + * here is that we can access any page asynchronous to other VM + * users, because memory failures could happen anytime and anywhere, + * possibly violating some of their assumptions. This is why this code + * has to be extremely careful. Generally it tries to use normal locking + * rules, as in get the standard locks, even if that means the + * error handling takes potentially a long time. + * + * Some of the operations here are somewhat inefficient and have non + * linear algorithmic complexity, because the data structures have not + * been optimized for this case. This is in particular the case + * for the mapping from a vma to a process. Since this case is expected + * to be rare we hope we can get away with this. + +The code consists of a the high level handler in mm/memory-failure.c, +a new page poison bit and various checks in the VM to handle poisoned +pages. + +The main target right now is KVM guests, but it works for all kinds +of applications. KVM support requires a recent qemu-kvm release. + +For the KVM use there was need for a new signal type so that +KVM can inject the machine check into the guest with the proper +address. This in theory allows other applications to handle +memory failures too. The expection is that near all applications +won't do that, but some very specialized ones might. + +--- + +There are two (actually three) modi memory failure recovery can be in: + +vm.memory_failure_recovery sysctl set to zero: + All memory failures cause a panic. Do not attempt recovery. + (on x86 this can be also affected by the tolerant level of the + MCE subsystem) + +early kill + (can be controlled globally and per process) + Send SIGBUS to the application as soon as the error is detected + This allows applications who can process memory errors in a gentle + way (e.g. drop affected object) + This is the mode used by KVM qemu. + +late kill + Send SIGBUS when the application runs into the corrupted page. + This is best for memory error unaware applications and default + Note some pages are always handled as late kill. + +--- + +User control: + +vm.memory_failure_recovery + See sysctl.txt + +vm.memory_failure_early_kill + Enable early kill mode globally + +PR_MCE_KILL + Set early/late kill mode/revert to system default + arg1: PR_MCE_KILL_CLEAR: Revert to system default + arg1: PR_MCE_KILL_SET: arg2 defines thread specific mode + PR_MCE_KILL_EARLY: Early kill + PR_MCE_KILL_LATE: Late kill + PR_MCE_KILL_DEFAULT: Use system global default +PR_MCE_KILL_GET + return current mode + + +--- + +Testing: + +madvise(MADV_POISON, ....) + (as root) + Poison a page in the process for testing + + +hwpoison-inject module through debugfs + /sys/debug/hwpoison/corrupt-pfn + +Inject hwpoison fault at PFN echoed into this file + + +Architecture specific MCE injector + +x86 has mce-inject, mce-test + +Some portable hwpoison test programs in mce-test, see blow. + +--- + +References: + +http://halobates.de/mce-lc09-2.pdf + Overview presentation from LinuxCon 09 + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git + Test suite (hwpoison specific portable tests in tsrc) + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git + x86 specific injector + + +--- + +Limitations: + +- Not all page types are supported and never will. Most kernel internal +objects cannot be recovered, only LRU pages for now. +- Right now hugepage support is missing. + +--- +Andi Kleen, Oct 2009 + -- cgit v1.2.2 ipasa'>wip-mc-bipasa The LITMUS^RT kernel.Bjoern Brandenburg
aboutsummaryrefslogblamecommitdiffstats
blob: a19a6f1a1cf10dd54fcce72e94e9e19e135db966 (plain) (tree)
1
2
3
4
5
6
7
8
9
  



                                                     
   

                          
                 
 

                        



                                  

                                                  








                                                                       
                                                                              



                                                                    





                                       
                                                    


                                                         


                                        
                   






































































































































                                                                                
 





                                             
 

                                       

                        












                                                                       
                                                               
                                                 












































                                                               
                                                         









                                                       
                                          


                    
                                                         


















                                                                             
                                                         
 




                                                              

                                      


                                                             


                                                                 

                                      


                                                             


                                                          








                                                                             


                                                             








                                                                             








                                                        


                                                                        
                                   
     
                                   



                                               


                                                                     
                                            
     




                                                                    
      

 


                                                                         




                                                                           


                                                                         




                                                                              
                                                                               
                                                  
 
                                      


                                                
                       
                                
/*
 * Copyright (C) 2004 IBM
 *
 * Implements the generic device dma API for powerpc.
 * the pci and vio busses
 */
#ifndef _ASM_DMA_MAPPING_H
#define _ASM_DMA_MAPPING_H
#ifdef __KERNEL__

#include <linux/types.h>
#include <linux/cache.h>
/* need struct page definitions */
#include <linux/mm.h>
#include <asm/scatterlist.h>
#include <asm/io.h>

#define DMA_ERROR_CODE		(~(dma_addr_t)0x0)

#ifdef CONFIG_NOT_COHERENT_CACHE
/*
 * DMA-consistent mapping functions for PowerPCs that don't support
 * cache snooping.  These allocate/free a region of uncached mapped
 * memory space for use with DMA devices.  Alternatively, you could
 * allocate the space "normally" and use the cache management functions
 * to ensure it is consistent.
 */
extern void *__dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp);
extern void __dma_free_coherent(size_t size, void *vaddr);
extern void __dma_sync(void *vaddr, size_t size, int direction);
extern void __dma_sync_page(struct page *page, unsigned long offset,
				 size_t size, int direction);

#else /* ! CONFIG_NOT_COHERENT_CACHE */
/*
 * Cache coherent cores.
 */

#define __dma_alloc_coherent(gfp, size, handle)	NULL
#define __dma_free_coherent(size, addr)		((void)0)
#define __dma_sync(addr, size, rw)		((void)0)
#define __dma_sync_page(pg, off, sz, rw)	((void)0)

#endif /* ! CONFIG_NOT_COHERENT_CACHE */

#ifdef CONFIG_PPC64
/*
 * DMA operations are abstracted for G5 vs. i/pSeries, PCI vs. VIO
 */
struct dma_mapping_ops {
	void *		(*alloc_coherent)(struct device *dev, size_t size,
				dma_addr_t *dma_handle, gfp_t flag);
	void		(*free_coherent)(struct device *dev, size_t size,
				void *vaddr, dma_addr_t dma_handle);
	dma_addr_t	(*map_single)(struct device *dev, void *ptr,
				size_t size, enum dma_data_direction direction);
	void		(*unmap_single)(struct device *dev, dma_addr_t dma_addr,
				size_t size, enum dma_data_direction direction);
	int		(*map_sg)(struct device *dev, struct scatterlist *sg,
				int nents, enum dma_data_direction direction);