Diffstat (limited to 'mm/swapfile.c')
 mm/swapfile.c | 600 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 378 insertions(+), 222 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 54a9f87e5162..da422c47e2ee 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -16,6 +16,7 @@
 #include <linux/namei.h>
 #include <linux/shm.h>
 #include <linux/blkdev.h>
+#include <linux/random.h>
 #include <linux/writeback.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -32,9 +33,11 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
+#include <linux/page_cgroup.h>
 
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
+long nr_swap_pages;
 long total_swap_pages;
 static int swap_overflow;
 static int least_priority;
@@ -83,15 +86,96 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 	up_read(&swap_unplug_sem);
 }
 
+/*
+ * swapon tells the device that all the old swap contents can be discarded,
+ * to allow the swap device to optimize its wear-levelling.
+ */
+static int discard_swap(struct swap_info_struct *si)
+{
+	struct swap_extent *se;
+	int err = 0;
+
+	list_for_each_entry(se, &si->extent_list, list) {
+		sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
+		sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
+
+		if (se->start_page == 0) {
+			/* Do not discard the swap header page! */
+			start_block += 1 << (PAGE_SHIFT - 9);
+			nr_blocks -= 1 << (PAGE_SHIFT - 9);
+			if (!nr_blocks)
+				continue;
+		}
+
+		err = blkdev_issue_discard(si->bdev, start_block,
+						nr_blocks, GFP_KERNEL);
+		if (err)
+			break;
+
+		cond_resched();
+	}
+	return err;		/* That will often be -EOPNOTSUPP */
+}
+
+/*
+ * swap allocation tells the device that a cluster of swap can now be discarded,
+ * to allow the swap device to optimize its wear-levelling.
+ */
+static void discard_swap_cluster(struct swap_info_struct *si,
+				 pgoff_t start_page, pgoff_t nr_pages)
+{
+	struct swap_extent *se = si->curr_swap_extent;
+	int found_extent = 0;
+
+	while (nr_pages) {
+		struct list_head *lh;
+
+		if (se->start_page <= start_page &&
+		    start_page < se->start_page + se->nr_pages) {
+			pgoff_t offset = start_page - se->start_page;
+			sector_t start_block = se->start_block + offset;
+			sector_t nr_blocks = se->nr_pages - offset;
+
+			if (nr_blocks > nr_pages)
+				nr_blocks = nr_pages;
+			start_page += nr_blocks;
+			nr_pages -= nr_blocks;
+
+			if (!found_extent++)
+				si->curr_swap_extent = se;
+
+			start_block <<= PAGE_SHIFT - 9;
+			nr_blocks <<= PAGE_SHIFT - 9;
+			if (blkdev_issue_discard(si->bdev, start_block,
+							nr_blocks, GFP_NOIO))
+				break;
+		}
+
+		lh = se->list.next;
+		if (lh == &si->extent_list)
+			lh = lh->next;
+		se = list_entry(lh, struct swap_extent, list);
+	}
+}
+
+static int wait_for_discard(void *word)
+{
+	schedule();
+	return 0;
+}
+
 #define SWAPFILE_CLUSTER	256
 #define LATENCY_LIMIT		256
 
 static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 {
-	unsigned long offset, last_in_cluster;
+	unsigned long offset;
+	unsigned long scan_base;
+	unsigned long last_in_cluster = 0;
 	int latency_ration = LATENCY_LIMIT;
+	int found_free_cluster = 0;
 
 	/*
 	 * We try to cluster swap pages by allocating them sequentially
 	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
 	 * way, however, we resort to first-free allocation, starting
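The `<< (PAGE_SHIFT - 9)` shifts in discard_swap() and discard_swap_cluster() above convert page-granular extents into the 512-byte sectors that blkdev_issue_discard() works in. A minimal userspace-style sketch of that conversion, assuming 4 KiB pages (the helper name is illustrative, not part of the patch):

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT	12	/* 4 KiB pages assumed */

	/* Convert a page-number range into a 512-byte sector range. */
	static void page_range_to_sectors(uint64_t start_page, uint64_t nr_pages,
					  uint64_t *start_sector, uint64_t *nr_sectors)
	{
		*start_sector = start_page << (PAGE_SHIFT - 9);	/* x8 for 4 KiB pages */
		*nr_sectors   = nr_pages   << (PAGE_SHIFT - 9);
	}

	int main(void)
	{
		uint64_t s, n;

		/* Skip the header page, as discard_swap() does for start_page == 0. */
		page_range_to_sectors(1, 255, &s, &n);
		printf("discard sectors %llu..%llu\n",
		       (unsigned long long)s, (unsigned long long)(s + n - 1));
		return 0;
	}

For a 4 KiB page the shift is 3, i.e. eight 512-byte sectors per page.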
@@ -99,16 +183,42 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 	 * all over the entire swap partition, so that we reduce
 	 * overall disk seek times between swap pages.  -- sct
 	 * But we do now try to find an empty cluster.  -Andrea
+	 * And we let swap pages go all over an SSD partition.  Hugh
 	 */
 
 	si->flags += SWP_SCANNING;
-	if (unlikely(!si->cluster_nr)) {
-		si->cluster_nr = SWAPFILE_CLUSTER - 1;
-		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER)
-			goto lowest;
+	scan_base = offset = si->cluster_next;
+
+	if (unlikely(!si->cluster_nr--)) {
+		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
+			si->cluster_nr = SWAPFILE_CLUSTER - 1;
+			goto checks;
+		}
+		if (si->flags & SWP_DISCARDABLE) {
+			/*
+			 * Start range check on racing allocations, in case
+			 * they overlap the cluster we eventually decide on
+			 * (we scan without swap_lock to allow preemption).
+			 * It's hardly conceivable that cluster_nr could be
+			 * wrapped during our scan, but don't depend on it.
+			 */
+			if (si->lowest_alloc)
+				goto checks;
+			si->lowest_alloc = si->max;
+			si->highest_alloc = 0;
+		}
 		spin_unlock(&swap_lock);
 
-		offset = si->lowest_bit;
+		/*
+		 * If seek is expensive, start searching for new cluster from
+		 * start of partition, to minimize the span of allocated swap.
+		 * But if seek is cheap, search from our current position, so
+		 * that swap is allocated from all over the partition: if the
+		 * Flash Translation Layer only remaps within limited zones,
+		 * we don't want to wear out the first zone too quickly.
+		 */
+		if (!(si->flags & SWP_SOLIDSTATE))
+			scan_base = offset = si->lowest_bit;
 		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
 
 		/* Locate the first empty (unaligned) cluster */
@@ -117,43 +227,124 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 			last_in_cluster = offset + SWAPFILE_CLUSTER;
 		else if (offset == last_in_cluster) {
 			spin_lock(&swap_lock);
-			si->cluster_next = offset-SWAPFILE_CLUSTER+1;
-			goto cluster;
+			offset -= SWAPFILE_CLUSTER - 1;
+			si->cluster_next = offset;
+			si->cluster_nr = SWAPFILE_CLUSTER - 1;
+			found_free_cluster = 1;
+			goto checks;
 		}
 		if (unlikely(--latency_ration < 0)) {
 			cond_resched();
 			latency_ration = LATENCY_LIMIT;
 		}
 	}
+
+	offset = si->lowest_bit;
+	last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
+
+	/* Locate the first empty (unaligned) cluster */
+	for (; last_in_cluster < scan_base; offset++) {
+		if (si->swap_map[offset])
+			last_in_cluster = offset + SWAPFILE_CLUSTER;
+		else if (offset == last_in_cluster) {
+			spin_lock(&swap_lock);
+			offset -= SWAPFILE_CLUSTER - 1;
+			si->cluster_next = offset;
+			si->cluster_nr = SWAPFILE_CLUSTER - 1;
+			found_free_cluster = 1;
+			goto checks;
+		}
+		if (unlikely(--latency_ration < 0)) {
+			cond_resched();
+			latency_ration = LATENCY_LIMIT;
+		}
+	}
+
+	offset = scan_base;
 	spin_lock(&swap_lock);
-	goto lowest;
+	si->cluster_nr = SWAPFILE_CLUSTER - 1;
+	si->lowest_alloc = 0;
 	}
 
-	si->cluster_nr--;
-cluster:
-	offset = si->cluster_next;
-	if (offset > si->highest_bit)
-lowest:	offset = si->lowest_bit;
-checks:	if (!(si->flags & SWP_WRITEOK))
+checks:
+	if (!(si->flags & SWP_WRITEOK))
 		goto no_page;
 	if (!si->highest_bit)
 		goto no_page;
-	if (!si->swap_map[offset]) {
-		if (offset == si->lowest_bit)
-			si->lowest_bit++;
-		if (offset == si->highest_bit)
-			si->highest_bit--;
-		si->inuse_pages++;
-		if (si->inuse_pages == si->pages) {
-			si->lowest_bit = si->max;
-			si->highest_bit = 0;
-		}
-		si->swap_map[offset] = 1;
-		si->cluster_next = offset + 1;
-		si->flags -= SWP_SCANNING;
-		return offset;
-	}
+	if (offset > si->highest_bit)
+		scan_base = offset = si->lowest_bit;
+	if (si->swap_map[offset])
+		goto scan;
+
+	if (offset == si->lowest_bit)
+		si->lowest_bit++;
+	if (offset == si->highest_bit)
+		si->highest_bit--;
+	si->inuse_pages++;
+	if (si->inuse_pages == si->pages) {
+		si->lowest_bit = si->max;
+		si->highest_bit = 0;
+	}
+	si->swap_map[offset] = 1;
+	si->cluster_next = offset + 1;
+	si->flags -= SWP_SCANNING;
+
+	if (si->lowest_alloc) {
+		/*
+		 * Only set when SWP_DISCARDABLE, and there's a scan
+		 * for a free cluster in progress or just completed.
+		 */
+		if (found_free_cluster) {
+			/*
+			 * To optimize wear-levelling, discard the
+			 * old data of the cluster, taking care not to
+			 * discard any of its pages that have already
+			 * been allocated by racing tasks (offset has
+			 * already stepped over any at the beginning).
+			 */
+			if (offset < si->highest_alloc &&
+			    si->lowest_alloc <= last_in_cluster)
+				last_in_cluster = si->lowest_alloc - 1;
+			si->flags |= SWP_DISCARDING;
+			spin_unlock(&swap_lock);
+
+			if (offset < last_in_cluster)
+				discard_swap_cluster(si, offset,
+					last_in_cluster - offset + 1);
+
+			spin_lock(&swap_lock);
+			si->lowest_alloc = 0;
+			si->flags &= ~SWP_DISCARDING;
+
+			smp_mb();	/* wake_up_bit advises this */
+			wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
+
+		} else if (si->flags & SWP_DISCARDING) {
+			/*
+			 * Delay using pages allocated by racing tasks
+			 * until the whole discard has been issued. We
+			 * could defer that delay until swap_writepage,
+			 * but it's easier to keep this self-contained.
+			 */
+			spin_unlock(&swap_lock);
+			wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
+				wait_for_discard, TASK_UNINTERRUPTIBLE);
+			spin_lock(&swap_lock);
+		} else {
+			/*
+			 * Note pages allocated by racing tasks while
+			 * scan for a free cluster is in progress, so
+			 * that its final discard can exclude them.
+			 */
+			if (offset < si->lowest_alloc)
+				si->lowest_alloc = offset;
+			if (offset > si->highest_alloc)
+				si->highest_alloc = offset;
+		}
+	}
+	return offset;
 
+scan:
 	spin_unlock(&swap_lock);
 	while (++offset <= si->highest_bit) {
 		if (!si->swap_map[offset]) {
@@ -165,8 +356,18 @@ checks: if (!(si->flags & SWP_WRITEOK))
 			latency_ration = LATENCY_LIMIT;
 		}
 	}
+	offset = si->lowest_bit;
+	while (++offset < scan_base) {
+		if (!si->swap_map[offset]) {
+			spin_lock(&swap_lock);
+			goto checks;
+		}
+		if (unlikely(--latency_ration < 0)) {
+			cond_resched();
+			latency_ration = LATENCY_LIMIT;
+		}
+	}
 	spin_lock(&swap_lock);
-	goto lowest;
 
 no_page:
 	si->flags -= SWP_SCANNING;
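The rewritten scan_swap_map() above keeps the existing trick for locating an empty cluster: walk the map once and, whenever a used slot is met, push the candidate end out to offset + SWAPFILE_CLUSTER, so that reaching last_in_cluster proves the preceding SWAPFILE_CLUSTER slots were all free. A simplified sketch of just that search, with the locking, latency throttling and SSD/discard handling left out and illustrative names:

	#define SWAPFILE_CLUSTER 256

	/*
	 * Return the first offset in map[lowest..max] that starts a run of
	 * SWAPFILE_CLUSTER free (zero) slots, or 0 if no such run exists.
	 */
	unsigned long find_free_cluster(const unsigned char *map,
					unsigned long lowest, unsigned long max)
	{
		unsigned long offset = lowest;
		unsigned long last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		for (; last_in_cluster <= max; offset++) {
			if (map[offset])	/* used slot: restart the run */
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster)
				return offset - SWAPFILE_CLUSTER + 1;
		}
		return 0;
	}

The kernel version additionally runs this scan twice (from cluster_next to the end, then from lowest_bit back up to scan_base) so that SSD allocation can spread over the whole partition before falling back to single free slots.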
@@ -268,10 +469,11 @@ bad_nofile:
 	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
 out:
 	return NULL;
 }
 
-static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
+static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
 {
+	unsigned long offset = swp_offset(ent);
 	int count = p->swap_map[offset];
 
 	if (count < SWAP_MAP_MAX) {
@@ -286,6 +488,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
 				swap_list.next = p - swap_info;
 			nr_swap_pages++;
 			p->inuse_pages--;
+			mem_cgroup_uncharge_swap(ent);
 		}
 	}
 	return count;
@@ -301,7 +504,7 @@ void swap_free(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		swap_entry_free(p, swp_offset(entry));
+		swap_entry_free(p, entry);
 		spin_unlock(&swap_lock);
 	}
 }
@@ -326,101 +529,62 @@ static inline int page_swapcount(struct page *page)
 }
 
 /*
- * We can use this swap cache entry directly
- * if there are no other references to it.
+ * We can write to an anon page without COW if there are no other references
+ * to it.  And as a side-effect, free up its swap: because the old content
+ * on disk will never be read, and seeking back there to write new content
+ * later would only waste time away from clustering.
  */
-int can_share_swap_page(struct page *page)
+int reuse_swap_page(struct page *page)
 {
 	int count;
 
-	BUG_ON(!PageLocked(page));
+	VM_BUG_ON(!PageLocked(page));
 	count = page_mapcount(page);
-	if (count <= 1 && PageSwapCache(page))
+	if (count <= 1 && PageSwapCache(page)) {
 		count += page_swapcount(page);
+		if (count == 1 && !PageWriteback(page)) {
+			delete_from_swap_cache(page);
+			SetPageDirty(page);
+		}
+	}
 	return count == 1;
 }
 
 /*
- * Work out if there are any other processes sharing this
- * swap cache page. Free it if you can. Return success.
+ * If swap is getting full, or if there are no more mappings of this page,
+ * then try_to_free_swap is called to free its swap space.
  */
-static int remove_exclusive_swap_page_count(struct page *page, int count)
+int try_to_free_swap(struct page *page)
 {
-	int retval;
-	struct swap_info_struct * p;
-	swp_entry_t entry;
-
-	BUG_ON(PagePrivate(page));
-	BUG_ON(!PageLocked(page));
+	VM_BUG_ON(!PageLocked(page));
 
 	if (!PageSwapCache(page))
 		return 0;
 	if (PageWriteback(page))
 		return 0;
-	if (page_count(page) != count) /* us + cache + ptes */
+	if (page_swapcount(page))
 		return 0;
 
-	entry.val = page_private(page);
-	p = swap_info_get(entry);
-	if (!p)
-		return 0;
-
-	/* Is the only swap cache user the cache itself? */
-	retval = 0;
-	if (p->swap_map[swp_offset(entry)] == 1) {
-		/* Recheck the page count with the swapcache lock held.. */
-		spin_lock_irq(&swapper_space.tree_lock);
-		if ((page_count(page) == count) && !PageWriteback(page)) {
-			__delete_from_swap_cache(page);
-			SetPageDirty(page);
-			retval = 1;
-		}
-		spin_unlock_irq(&swapper_space.tree_lock);
-	}
-	spin_unlock(&swap_lock);
-
-	if (retval) {
-		swap_free(entry);
-		page_cache_release(page);
-	}
-
-	return retval;
-}
-
-/*
- * Most of the time the page should have two references: one for the
- * process and one for the swap cache.
- */
-int remove_exclusive_swap_page(struct page *page)
-{
-	return remove_exclusive_swap_page_count(page, 2);
-}
-
-/*
- * The pageout code holds an extra reference to the page.  That raises
- * the reference count to test for to 2 for a page that is only in the
- * swap cache plus 1 for each process that maps the page.
- */
-int remove_exclusive_swap_page_ref(struct page *page)
-{
-	return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
+	delete_from_swap_cache(page);
+	SetPageDirty(page);
+	return 1;
 }
 
 /*
  * Free the swap entry like above, but also try to
  * free the page cache entry if it is the last user.
  */
-void free_swap_and_cache(swp_entry_t entry)
+int free_swap_and_cache(swp_entry_t entry)
 {
-	struct swap_info_struct * p;
+	struct swap_info_struct *p;
 	struct page *page = NULL;
 
 	if (is_migration_entry(entry))
-		return;
+		return 1;
 
 	p = swap_info_get(entry);
 	if (p) {
-		if (swap_entry_free(p, swp_offset(entry)) == 1) {
+		if (swap_entry_free(p, entry) == 1) {
 			page = find_get_page(&swapper_space, entry.val);
 			if (page && !trylock_page(page)) {
 				page_cache_release(page);
@@ -430,20 +594,19 @@ void free_swap_and_cache(swp_entry_t entry)
 		spin_unlock(&swap_lock);
 	}
 	if (page) {
-		int one_user;
-
-		BUG_ON(PagePrivate(page));
-		one_user = (page_count(page) == 2);
-		/* Only cache user (+us), or swap space full? Free it! */
-		/* Also recheck PageSwapCache after page is locked (above) */
+		/*
+		 * Not mapped elsewhere, or swap space full? Free it!
+		 * Also recheck PageSwapCache now page is locked (above).
+		 */
 		if (PageSwapCache(page) && !PageWriteback(page) &&
-				(one_user || vm_swap_full())) {
+				(!page_mapped(page) || vm_swap_full())) {
 			delete_from_swap_cache(page);
 			SetPageDirty(page);
 		}
 		unlock_page(page);
 		page_cache_release(page);
 	}
+	return p != NULL;
 }
 
 #ifdef CONFIG_HIBERNATION
@@ -530,17 +693,18 @@ unsigned int count_swap_pages(int type, int free)
 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, swp_entry_t entry, struct page *page)
 {
+	struct mem_cgroup *ptr = NULL;
 	spinlock_t *ptl;
 	pte_t *pte;
 	int ret = 1;
 
-	if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr))
 		ret = -ENOMEM;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
 		if (ret > 0)
-			mem_cgroup_uncharge_page(page);
+			mem_cgroup_cancel_charge_swapin(ptr);
 		ret = 0;
 		goto out;
 	}
@@ -550,6 +714,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
 	page_add_anon_rmap(page, vma, addr);
+	mem_cgroup_commit_charge_swapin(page, ptr);
 	swap_free(entry);
 	/*
 	 * Move the page to the active list so it is not
@@ -776,10 +941,10 @@ static int try_to_unuse(unsigned int type)
 			break;
 		}
 
-		/* 
-		 * Get a page for the entry, using the existing swap
-		 * cache page if there is one.  Otherwise, get a clean
-		 * page and read the swap into it. 
-		 */
+		/*
+		 * Get a page for the entry, using the existing swap
+		 * cache page if there is one.  Otherwise, get a clean
+		 * page and read the swap into it.
+		 */
 		swap_map = &si->swap_map[i];
 		entry = swp_entry(type, i);
@@ -930,7 +1095,16 @@ static int try_to_unuse(unsigned int type)
 			lock_page(page);
 			wait_on_page_writeback(page);
 		}
-		if (PageSwapCache(page))
+
+		/*
+		 * It is conceivable that a racing task removed this page from
+		 * swap cache just before we acquired the page lock at the top,
+		 * or while we dropped it in unuse_mm().  The page might even
+		 * be back in swap cache on another swap area: that we must not
+		 * delete, since it may not have been written out to swap yet.
+		 */
+		if (PageSwapCache(page) &&
+		    likely(page_private(page) == entry.val))
 			delete_from_swap_cache(page);
 
 		/*
@@ -1203,26 +1377,6 @@ out:
 	return ret;
 }
 
-#if 0 /* We don't need this yet */
-#include <linux/backing-dev.h>
-int page_queue_congested(struct page *page)
-{
-	struct backing_dev_info *bdi;
-
-	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */
-
-	if (PageSwapCache(page)) {
-		swp_entry_t entry = { .val = page_private(page) };
-		struct swap_info_struct *sis;
-
-		sis = get_swap_info_struct(swp_type(entry));
-		bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
-	} else
-		bdi = page->mapping->backing_dev_info;
-	return bdi_write_congested(bdi);
-}
-#endif
-
 asmlinkage long sys_swapoff(const char __user * specialfile)
 {
 	struct swap_info_struct * p = NULL;
@@ -1233,7 +1387,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	char * pathname;
 	int i, type, prev;
 	int err;
-	
+
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
@@ -1253,7 +1407,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	spin_lock(&swap_lock);
 	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
 		p = swap_info + type;
-		if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
+		if (p->flags & SWP_WRITEOK) {
 			if (p->swap_file->f_mapping == mapping)
 				break;
 		}
@@ -1343,6 +1497,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
+	/* Destroy swap account information */
+	swap_cgroup_swapoff(type);
+
 	inode = mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
 		struct block_device *bdev = I_BDEV(inode);
@@ -1426,12 +1583,12 @@ static int swap_show(struct seq_file *swap, void *v)
 	file = ptr->swap_file;
 	len = seq_path(swap, &file->f_path, " \t\n\\");
 	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
-			len < 40 ? 40 - len : 1, " ",
-			S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
-				"partition" : "file\t",
-			ptr->pages << (PAGE_SHIFT - 10),
-			ptr->inuse_pages << (PAGE_SHIFT - 10),
-			ptr->prio);
+			len < 40 ? 40 - len : 1, " ",
+			S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
+				"partition" : "file\t",
+			ptr->pages << (PAGE_SHIFT - 10),
+			ptr->inuse_pages << (PAGE_SHIFT - 10),
+			ptr->prio);
 	return 0;
 }
 
@@ -1487,12 +1644,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	int i, prev;
 	int error;
 	union swap_header *swap_header = NULL;
-	int swap_header_version;
 	unsigned int nr_good_pages = 0;
 	int nr_extents = 0;
 	sector_t span;
 	unsigned long maxpages = 1;
-	int swapfilesize;
+	unsigned long swapfilepages;
 	unsigned short *swap_map = NULL;
 	struct page *page = NULL;
 	struct inode *inode = NULL;
@@ -1570,7 +1726,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		goto bad_swap;
 	}
 
-	swapfilesize = i_size_read(inode) >> PAGE_SHIFT;
+	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
 
 	/*
 	 * Read the swap header.
@@ -1584,102 +1740,92 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		error = PTR_ERR(page);
 		goto bad_swap;
 	}
-	kmap(page);
-	swap_header = page_address(page);
+	swap_header = kmap(page);
 
-	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
-		swap_header_version = 1;
-	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
-		swap_header_version = 2;
-	else {
+	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
 		printk(KERN_ERR "Unable to find swap-space signature\n");
 		error = -EINVAL;
 		goto bad_swap;
 	}
 
-	switch (swap_header_version) {
-	case 1:
-		printk(KERN_ERR "version 0 swap is no longer supported. "
-			"Use mkswap -v1 %s\n", name);
+	/* swap partition endianess hack... */
+	if (swab32(swap_header->info.version) == 1) {
+		swab32s(&swap_header->info.version);
+		swab32s(&swap_header->info.last_page);
+		swab32s(&swap_header->info.nr_badpages);
+		for (i = 0; i < swap_header->info.nr_badpages; i++)
+			swab32s(&swap_header->info.badpages[i]);
+	}
+	/* Check the swap header's sub-version */
+	if (swap_header->info.version != 1) {
+		printk(KERN_WARNING
+		       "Unable to handle swap header version %d\n",
+		       swap_header->info.version);
 		error = -EINVAL;
 		goto bad_swap;
-	case 2:
-		/* swap partition endianess hack... */
-		if (swab32(swap_header->info.version) == 1) {
-			swab32s(&swap_header->info.version);
-			swab32s(&swap_header->info.last_page);
-			swab32s(&swap_header->info.nr_badpages);
-			for (i = 0; i < swap_header->info.nr_badpages; i++)
-				swab32s(&swap_header->info.badpages[i]);
-		}
-		/* Check the swap header's sub-version and the size of
-		   the swap file and bad block lists */
-		if (swap_header->info.version != 1) {
-			printk(KERN_WARNING
-			       "Unable to handle swap header version %d\n",
-			       swap_header->info.version);
-			error = -EINVAL;
-			goto bad_swap;
-		}
+	}
 
-		p->lowest_bit  = 1;
-		p->cluster_next = 1;
+	p->lowest_bit  = 1;
+	p->cluster_next = 1;
 
-		/*
-		 * Find out how many pages are allowed for a single swap
-		 * device. There are two limiting factors: 1) the number of
-		 * bits for the swap offset in the swp_entry_t type and
-		 * 2) the number of bits in the a swap pte as defined by
-		 * the different architectures. In order to find the
-		 * largest possible bit mask a swap entry with swap type 0
-		 * and swap offset ~0UL is created, encoded to a swap pte,
-		 * decoded to a swp_entry_t again and finally the swap
-		 * offset is extracted. This will mask all the bits from
-		 * the initial ~0UL mask that can't be encoded in either
-		 * the swp_entry_t or the architecture definition of a
-		 * swap pte.
-		 */
-		maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1;
-		if (maxpages > swap_header->info.last_page)
-			maxpages = swap_header->info.last_page;
-		p->highest_bit = maxpages - 1;
+	/*
+	 * Find out how many pages are allowed for a single swap
+	 * device. There are two limiting factors: 1) the number of
+	 * bits for the swap offset in the swp_entry_t type and
+	 * 2) the number of bits in the a swap pte as defined by
+	 * the different architectures. In order to find the
+	 * largest possible bit mask a swap entry with swap type 0
+	 * and swap offset ~0UL is created, encoded to a swap pte,
+	 * decoded to a swp_entry_t again and finally the swap
+	 * offset is extracted. This will mask all the bits from
+	 * the initial ~0UL mask that can't be encoded in either
+	 * the swp_entry_t or the architecture definition of a
+	 * swap pte.
+	 */
+	maxpages = swp_offset(pte_to_swp_entry(
+			swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1;
+	if (maxpages > swap_header->info.last_page)
+		maxpages = swap_header->info.last_page;
+	p->highest_bit = maxpages - 1;
 
-		error = -EINVAL;
-		if (!maxpages)
-			goto bad_swap;
-		if (swapfilesize && maxpages > swapfilesize) {
-			printk(KERN_WARNING
-			       "Swap area shorter than signature indicates\n");
-			goto bad_swap;
-		}
-		if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
-			goto bad_swap;
-		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
-			goto bad_swap;
+	error = -EINVAL;
+	if (!maxpages)
+		goto bad_swap;
+	if (swapfilepages && maxpages > swapfilepages) {
+		printk(KERN_WARNING
+		       "Swap area shorter than signature indicates\n");
+		goto bad_swap;
+	}
+	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
+		goto bad_swap;
+	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+		goto bad_swap;
 
-		/* OK, set up the swap map and apply the bad block list */
-		swap_map = vmalloc(maxpages * sizeof(short));
-		if (!swap_map) {
-			error = -ENOMEM;
-			goto bad_swap;
-		}
+	/* OK, set up the swap map and apply the bad block list */
+	swap_map = vmalloc(maxpages * sizeof(short));
+	if (!swap_map) {
+		error = -ENOMEM;
+		goto bad_swap;
+	}
 
-		error = 0;
-		memset(swap_map, 0, maxpages * sizeof(short));
-		for (i = 0; i < swap_header->info.nr_badpages; i++) {
-			int page_nr = swap_header->info.badpages[i];
-			if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
-				error = -EINVAL;
-			else
-				swap_map[page_nr] = SWAP_MAP_BAD;
-		}
-		nr_good_pages = swap_header->info.last_page -
-				swap_header->info.nr_badpages -
-				1 /* header page */;
-		if (error)
+	memset(swap_map, 0, maxpages * sizeof(short));
+	for (i = 0; i < swap_header->info.nr_badpages; i++) {
+		int page_nr = swap_header->info.badpages[i];
+		if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
+			error = -EINVAL;
 			goto bad_swap;
+		}
+		swap_map[page_nr] = SWAP_MAP_BAD;
 	}
 
+	error = swap_cgroup_swapon(type, maxpages);
+	if (error)
+		goto bad_swap;
+
+	nr_good_pages = swap_header->info.last_page -
+			swap_header->info.nr_badpages -
+			1 /* header page */;
+
 	if (nr_good_pages) {
 		swap_map[0] = SWAP_MAP_BAD;
 		p->max = maxpages;
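For reference on the header layout that the reworked checks above rely on: the "SWAPSPACE2" magic sits in the last 10 bytes of the first page, and version, last_page and nr_badpages start at byte 1024, after the boot-block area. A userspace sketch of the same validation, assuming 4 KiB pages; the struct and function names are illustrative and only the fields actually read are declared:

	#include <stdint.h>
	#include <string.h>
	#include <byteswap.h>

	#define PAGE_SIZE 4096	/* assumed */

	struct swap_header_info {	/* mirrors the start of swap_header.info */
		uint8_t	 bootbits[1024];
		uint32_t version;
		uint32_t last_page;
		uint32_t nr_badpages;
	};

	/* Return usable pages as sys_swapon computes them, or -1 if invalid. */
	long parse_swap_header(const void *page)
	{
		const char *magic = (const char *)page + PAGE_SIZE - 10;
		struct swap_header_info hdr;

		if (memcmp(magic, "SWAPSPACE2", 10))
			return -1;			/* no v2 signature */

		memcpy(&hdr, page, sizeof(hdr));
		if (bswap_32(hdr.version) == 1) {	/* other-endian mkswap */
			hdr.version = bswap_32(hdr.version);
			hdr.last_page = bswap_32(hdr.last_page);
			hdr.nr_badpages = bswap_32(hdr.nr_badpages);
		}
		if (hdr.version != 1)
			return -1;			/* unknown sub-version */

		/* last_page - nr_badpages - header page, as the hunk above does */
		return (long)hdr.last_page - (long)hdr.nr_badpages - 1;
	}

The byte-swap branch mirrors the endianess hack above: a header written by an other-endian mkswap is recognised because swapping its version field yields 1.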
@@ -1697,6 +1843,13 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		goto bad_swap;
 	}
 
+	if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+		p->flags |= SWP_SOLIDSTATE;
+		p->cluster_next = 1 + (random32() % p->highest_bit);
+	}
+	if (discard_swap(p) == 0)
+		p->flags |= SWP_DISCARDABLE;
+
 	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);
 	if (swap_flags & SWAP_FLAG_PREFER)
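blk_queue_nonrot() above is the in-kernel test for a non-rotational request queue. From userspace the same hint is normally exposed as the block device's queue/rotational sysfs attribute (0 meaning non-rotational); a small sketch, with the sysfs path assumed rather than taken from this patch:

	#include <stdio.h>

	/*
	 * Return 1 if the named block device reports itself as non-rotational,
	 * 0 if rotational, -1 if the sysfs attribute cannot be read.
	 */
	int is_nonrotational(const char *dev)	/* e.g. "sda" */
	{
		char path[128];
		int rotational = -1;
		FILE *f;

		snprintf(path, sizeof(path), "/sys/block/%s/queue/rotational", dev);
		f = fopen(path, "r");
		if (!f)
			return -1;
		if (fscanf(f, "%d", &rotational) != 1)
			rotational = -1;
		fclose(f);

		if (rotational < 0)
			return -1;
		return rotational == 0;
	}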
@@ -1705,14 +1858,16 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	else
 		p->prio = --least_priority;
 	p->swap_map = swap_map;
-	p->flags = SWP_ACTIVE;
+	p->flags |= SWP_WRITEOK;
 	nr_swap_pages += nr_good_pages;
 	total_swap_pages += nr_good_pages;
 
 	printk(KERN_INFO "Adding %uk swap on %s. "
-			"Priority:%d extents:%d across:%lluk\n",
+			"Priority:%d extents:%d across:%lluk %s%s\n",
 		nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
-		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10));
+		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
+		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
+		(p->flags & SWP_DISCARDABLE) ? "D" : "");
 
 	/* insert swap space into swap_list: */
 	prev = -1;
@@ -1738,6 +1893,7 @@ bad_swap:
 		bd_release(bdev);
 	}
 	destroy_swap_extents(p);
+	swap_cgroup_swapoff(type);
 bad_swap_2:
 	spin_lock(&swap_lock);
 	p->swap_file = NULL;