From 8fe23e057172223fe2048768a4d87ab7de7477bc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 14 Dec 2009 17:58:33 -0800 Subject: mm: clear node in N_HIGH_MEMORY and stop kswapd when all memory is offlined When memory is hot-removed, its node must be cleared in N_HIGH_MEMORY if there are no present pages left. In such a situation, kswapd must also be stopped since it has nothing left to do. Signed-off-by: David Rientjes Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: Yasunori Goto Cc: Mel Gorman Cc: Rafael J. Wysocki Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Mel Gorman Cc: Randy Dunlap Cc: Nishanth Aravamudan Cc: Andi Kleen Cc: David Rientjes Cc: Adam Litke Cc: Andy Whitcroft Cc: Eric Whitney Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/swap.h') diff --git a/include/linux/swap.h b/include/linux/swap.h index 4ec90019c1a4..abce8a0b2507 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -273,6 +273,7 @@ extern int scan_unevictable_register_node(struct node *node); extern void scan_unevictable_unregister_node(struct node *node); extern int kswapd_run(int nid); +extern void kswapd_stop(int nid); #ifdef CONFIG_MMU /* linux/mm/shmem.c */ -- cgit v1.2.2 From f29ad6a99b596b8169744d107bf088e8be9e8d0d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:40 -0800 Subject: swap_info: private to swapfile.c The swap_info_struct is mostly private to mm/swapfile.c, with only one other in-tree user: get_swap_bio(). Adjust its interface to map_swap_page(), so that we can then remove get_swap_info_struct(). But there is a popular user out-of-tree, TuxOnIce: so leave the declaration of swap_info_struct in linux/swap.h. 
Signed-off-by: Hugh Dickins Cc: Nigel Cunningham Cc: KAMEZAWA Hiroyuki Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/swap.h') diff --git a/include/linux/swap.h b/include/linux/swap.h index abce8a0b2507..82aa7e121c05 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -318,9 +318,8 @@ extern void swapcache_free(swp_entry_t, struct page *page); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); -extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); +extern sector_t map_swap_page(swp_entry_t, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); -extern struct swap_info_struct *get_swap_info_struct(unsigned); extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; -- cgit v1.2.2 From efa90a981bbc891efad96db2a75b5487e00852ca Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:41 -0800 Subject: swap_info: change to array of pointers The swap_info_struct is only 76 or 104 bytes, but it does seem wrong to reserve an array of about 30 of them in bss, when most people will want only one. Change swap_info[] to an array of pointers. That does need a "type" field in the structure: pack it as a char with next type and short prio (aha, char is unsigned by default on PowerPC). Use the (admittedly peculiar) name "type" throughout for this index. /proc/swaps does not take swap_lock: I wouldn't want it to, but do take care with barriers when adding a new item to the array (never removed). 
Signed-off-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux/swap.h') diff --git a/include/linux/swap.h b/include/linux/swap.h index 82aa7e121c05..f1c248796fb8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -159,9 +159,10 @@ enum { * The in-memory structure used to track swap areas. */ struct swap_info_struct { - unsigned long flags; - int prio; /* swap priority */ - int next; /* next entry on swap list */ + unsigned long flags; /* SWP_USED etc: see above */ + signed short prio; /* swap priority of this type */ + signed char type; /* strange name for an index */ + signed char next; /* next type on the swap list */ struct file *swap_file; struct block_device *bdev; struct list_head extent_list; -- cgit v1.2.2 From 9625a5f289f7c3c100b59c317e2bcc3c7e2e51fb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:42 -0800 Subject: swap_info: include first_swap_extent Make better use of the space by folding first swap_extent into its swap_info_struct, instead of just the list_head: swap partitions need only that one, and for others it's used as a circular list anyway. 
[jirislaby@gmail.com: fix crash on double swapon] Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Jiri Slaby Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/swap.h') diff --git a/include/linux/swap.h b/include/linux/swap.h index f1c248796fb8..109dfe794237 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -165,7 +165,7 @@ struct swap_info_struct { signed char next; /* next type on the swap list */ struct file *swap_file; struct block_device *bdev; - struct list_head extent_list; + struct swap_extent first_swap_extent; struct swap_extent *curr_swap_extent; unsigned short *swap_map; unsigned int lowest_bit; -- cgit v1.2.2 From 253d553ba75ab26b3e9e2f70cbf6fbf0813f7e86 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:44 -0800 Subject: swap_info: SWAP_HAS_CACHE cleanups Though swap_count() is useful, I'm finding that swap_has_cache() and encode_swapmap() obscure what happens in the swap_map entry, just at those points where I need to understand it. Remove them, and pass more usable "usage" values to scan_swap_map(), swap_entry_free() and __swap_duplicate(), instead of the SWAP_MAP and SWAP_CACHE enum. Signed-off-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/swap.h') diff --git a/include/linux/swap.h b/include/linux/swap.h index 109dfe794237..c9d8870892b8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -154,7 +154,7 @@ enum { #define SWAP_MAP_MAX 0x7ffe #define SWAP_MAP_BAD 0x7fff #define SWAP_HAS_CACHE 0x8000 /* There is a swap cache of entry. */ -#define SWAP_COUNT_MASK (~SWAP_HAS_CACHE) + /* * The in-memory structure used to track swap areas. 
*/ -- cgit v1.2.2 From 8d69aaee80c123b460918816cbfa2e83224c3646 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:45 -0800 Subject: swap_info: swap_map of chars not shorts Halve the vmalloc'ed swap_map array from unsigned shorts to unsigned chars: it's still very unusual to reach a swap count of 126, and the next patch allows it to be extended indefinitely. Signed-off-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/swap.h') diff --git a/include/linux/swap.h b/include/linux/swap.h index c9d8870892b8..f733deb10748 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -151,9 +151,9 @@ enum { #define SWAP_CLUSTER_MAX 32 -#define SWAP_MAP_MAX 0x7ffe -#define SWAP_MAP_BAD 0x7fff -#define SWAP_HAS_CACHE 0x8000 /* There is a swap cache of entry. */ +#define SWAP_MAP_MAX 0x7e +#define SWAP_MAP_BAD 0x7f +#define SWAP_HAS_CACHE 0x80 /* There is a swap cache of entry. */ /* * The in-memory structure used to track swap areas. @@ -167,7 +167,7 @@ struct swap_info_struct { struct block_device *bdev; struct swap_extent first_swap_extent; struct swap_extent *curr_swap_extent; - unsigned short *swap_map; + unsigned char *swap_map; unsigned int lowest_bit; unsigned int highest_bit; unsigned int lowest_alloc; /* while preparing discard cluster */ -- cgit v1.2.2 From 570a335b8e22579e2a51a68136d2b1f907a20eec Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:46 -0800 Subject: swap_info: swap count continuations Swap is duplicated (reference count incremented by one) whenever the same swap page is inserted into another mm (when forking finds a swap entry in place of a pte, or when reclaim unmaps a pte to insert the swap entry). 
swap_info_struct's vmalloc'ed swap_map is the array of these reference counts: but what happens when the unsigned short (or unsigned char since the preceding patch) is full? (and its high bit is kept for a cache flag) We then lose track of it, never freeing, leaving it in use until swapoff: at which point we _hope_ that a single pass will have found all instances, assume there are no more, and will lose user data if we're wrong. Swapping of KSM pages has not yet been enabled; but it is implemented, and makes it very easy for a user to overflow the maximum swap count: possible with ordinary process pages, but unlikely, even when pid_max has been raised from PID_MAX_DEFAULT. This patch implements swap count continuations: when the count overflows, a continuation page is allocated and linked to the original vmalloc'ed map page, and this is used to hold the continuation counts for that entry and its neighbours. These continuation pages are seldom referenced: the common paths all work on the original swap_map, only referring to a continuation page when the low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'include/linux/swap.h') diff --git a/include/linux/swap.h b/include/linux/swap.h index f733deb10748..389e7bd92cca 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -145,15 +145,18 @@ enum { SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ + SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ /* add others here before... 
*/ SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32 -#define SWAP_MAP_MAX 0x7e -#define SWAP_MAP_BAD 0x7f -#define SWAP_HAS_CACHE 0x80 /* There is a swap cache of entry. */ +#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */ +#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */ +#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ +#define SWAP_CONT_MAX 0x7f /* Max count, in each swap_map continuation */ +#define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */ /* * The in-memory structure used to track swap areas. @@ -311,9 +314,10 @@ extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); -extern void swap_duplicate(swp_entry_t); -extern int swapcache_prepare(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); +extern int add_swap_count_continuation(swp_entry_t, gfp_t); +extern int swap_duplicate(swp_entry_t); +extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); extern void swapcache_free(swp_entry_t, struct page *page); extern int free_swap_and_cache(swp_entry_t); @@ -385,8 +389,14 @@ static inline void show_swap_cache_info(void) #define free_swap_and_cache(swp) is_migration_entry(swp) #define swapcache_prepare(swp) is_migration_entry(swp) -static inline void swap_duplicate(swp_entry_t swp) +static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) { + return 0; +} + +static inline int swap_duplicate(swp_entry_t swp) +{ + return 0; } static inline void swap_free(swp_entry_t swp) -- cgit v1.2.2 From aaa468653b4a0d11c603c48d716f765177a5a9e4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:47 -0800 Subject: swap_info: note SWAP_MAP_SHMEM While we're fiddling with the swap_map values, let's assign a particular value to shmem/tmpfs swap pages: their swap 
counts are never incremented, and it helps swapoff's try_to_unuse() a little if it can immediately distinguish those pages from process pages. Since we've no use for SWAP_MAP_BAD | COUNT_CONTINUED, we might as well use that 0xbf value for SWAP_MAP_SHMEM. Signed-off-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/swap.h') diff --git a/include/linux/swap.h b/include/linux/swap.h index 389e7bd92cca..ac43d87b89b0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -157,6 +157,7 @@ enum { #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ #define SWAP_CONT_MAX 0x7f /* Max count, in each swap_map continuation */ #define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */ +#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */ /* * The in-memory structure used to track swap areas. 
@@ -316,6 +317,7 @@ extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern int add_swap_count_continuation(swp_entry_t, gfp_t); +extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); @@ -394,6 +396,10 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) return 0; } +static inline void swap_shmem_alloc(swp_entry_t swp) +{ +} + static inline int swap_duplicate(swp_entry_t swp) { return 0; -- cgit v1.2.2 From 7509765a29cfb1a4c506c09b304aaf3b4111c653 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:48 -0800 Subject: swap_info: reorder its fields Reorder (and comment) the fields of swap_info_struct, to make better use of its cachelines: it's good for swap_duplicate() in particular if unsigned int max and swap_map are near the start. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux/swap.h') diff --git a/include/linux/swap.h b/include/linux/swap.h index ac43d87b89b0..9f0ca325e30d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -167,21 +167,21 @@ struct swap_info_struct { signed short prio; /* swap priority of this type */ signed char type; /* strange name for an index */ signed char next; /* next type on the swap list */ - struct file *swap_file; - struct block_device *bdev; - struct swap_extent first_swap_extent; - struct swap_extent *curr_swap_extent; - unsigned char *swap_map; - unsigned int lowest_bit; - unsigned int highest_bit; + unsigned int max; /* extent of the swap_map */ + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + unsigned int lowest_bit; /* index of first 
free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ + unsigned int inuse_pages; /* number of those currently in use */ + unsigned int cluster_next; /* likely index for next allocation */ + unsigned int cluster_nr; /* countdown to next cluster search */ unsigned int lowest_alloc; /* while preparing discard cluster */ unsigned int highest_alloc; /* while preparing discard cluster */ - unsigned int cluster_next; - unsigned int cluster_nr; - unsigned int pages; - unsigned int max; - unsigned int inuse_pages; - unsigned int old_block_size; + struct swap_extent *curr_swap_extent; + struct swap_extent first_swap_extent; + struct block_device *bdev; /* swap device or bdev of swap file */ + struct file *swap_file; /* seldom referenced */ + unsigned int old_block_size; /* seldom referenced */ }; struct swap_list_t { -- cgit v1.2.2 From d4906e1aa516cc965292b43b5a26122dd4344e7e Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 14 Dec 2009 17:58:49 -0800 Subject: swap: rework map_swap_page() again Seems that page_io.c doesn't really need to know that page_private(page) is the swp_entry 'val'. Rework map_swap_page() to do what its name says and map a page to a page offset in the swap space. The only other caller of map_swap_page() is internal to mm/swapfile.c and it does want to map a swap entry to the 'sector'. So rename map_swap_page() to map_swap_entry(), make it 'static' and implement map_swap_page() as a wrapper around that. 
Signed-off-by: Lee Schermerhorn Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/swap.h') diff --git a/include/linux/swap.h b/include/linux/swap.h index 9f0ca325e30d..a2602a8207a6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -325,7 +325,7 @@ extern void swapcache_free(swp_entry_t, struct page *page); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); -extern sector_t map_swap_page(swp_entry_t, struct block_device **); +extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); -- cgit v1.2.2