Diffstat (limited to 'mm/swapfile.c')
 mm/swapfile.c | 918 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 594 insertions(+), 324 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9c590eef7912..6cd0a8f90dc7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -22,6 +22,7 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
@@ -35,11 +36,15 @@
 #include <linux/swapops.h>
 #include <linux/page_cgroup.h>
 
+static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
+				 unsigned char);
+static void free_swap_count_continuations(struct swap_info_struct *);
+static sector_t map_swap_entry(swp_entry_t, struct block_device**);
+
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
 long nr_swap_pages;
 long total_swap_pages;
-static int swap_overflow;
 static int least_priority;
 
 static const char Bad_file[] = "Bad swap file entry ";
@@ -49,42 +54,20 @@ static const char Unused_offset[] = "Unused swap offset entry ";
 
 static struct swap_list_t swap_list = {-1, -1};
 
-static struct swap_info_struct swap_info[MAX_SWAPFILES];
+static struct swap_info_struct *swap_info[MAX_SWAPFILES];
 
 static DEFINE_MUTEX(swapon_mutex);
 
-/* For reference count accounting in swap_map */
-/* enum for swap_map[] handling. internal use only */
-enum {
-	SWAP_MAP = 0,	/* ops for reference from swap users */
-	SWAP_CACHE,	/* ops for reference from swap cache */
-};
-
-static inline int swap_count(unsigned short ent)
-{
-	return ent & SWAP_COUNT_MASK;
-}
-
-static inline bool swap_has_cache(unsigned short ent)
-{
-	return !!(ent & SWAP_HAS_CACHE);
-}
-
-static inline unsigned short encode_swapmap(int count, bool has_cache)
+static inline unsigned char swap_count(unsigned char ent)
 {
-	unsigned short ret = count;
-
-	if (has_cache)
-		return SWAP_HAS_CACHE | ret;
-	return ret;
+	return ent & ~SWAP_HAS_CACHE;	/* may include SWAP_HAS_CONT flag */
 }
 
-/* returnes 1 if swap entry is freed */
+/* returns 1 if swap entry is freed */
 static int
 __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
 {
-	int type = si - swap_info;
-	swp_entry_t entry = swp_entry(type, offset);
+	swp_entry_t entry = swp_entry(si->type, offset);
 	struct page *page;
 	int ret = 0;
 
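A note on the new encoding: each swap_map entry is now a single unsigned char, with the reference count in the low bits and flags in the high bits. A minimal sketch of how such an entry decodes, assuming the flag values this series defines on the include/linux/swap.h side (SWAP_HAS_CACHE 0x40, COUNT_CONTINUED 0x80, SWAP_MAP_MAX 0x3e):

	#include <assert.h>

	#define SWAP_HAS_CACHE	0x40	/* entry also has a swap-cache page */
	#define COUNT_CONTINUED	0x80	/* count continues in an extra page */
	#define SWAP_MAP_MAX	0x3e	/* largest count held in the map byte */

	static unsigned char swap_count(unsigned char ent)
	{
		return ent & ~SWAP_HAS_CACHE;	/* as in the patch above */
	}

	int main(void)
	{
		unsigned char ent = 3 | SWAP_HAS_CACHE;	/* 3 users + cache */

		assert(swap_count(ent) == 3);
		assert(ent & SWAP_HAS_CACHE);
		assert(swap_count(ent) <= SWAP_MAP_MAX);
		return 0;
	}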
@@ -120,7 +103,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 	down_read(&swap_unplug_sem);
 	entry.val = page_private(page);
 	if (PageSwapCache(page)) {
-		struct block_device *bdev = swap_info[swp_type(entry)].bdev;
+		struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
 		struct backing_dev_info *bdi;
 
 		/*
@@ -146,23 +129,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 static int discard_swap(struct swap_info_struct *si)
 {
 	struct swap_extent *se;
+	sector_t start_block;
+	sector_t nr_blocks;
 	int err = 0;
 
-	list_for_each_entry(se, &si->extent_list, list) {
-		sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
-		sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
+	/* Do not discard the swap header page! */
+	se = &si->first_swap_extent;
+	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
+	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
+	if (nr_blocks) {
+		err = blkdev_issue_discard(si->bdev, start_block,
+				nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+		if (err)
+			return err;
+		cond_resched();
+	}
 
-		if (se->start_page == 0) {
-			/* Do not discard the swap header page! */
-			start_block += 1 << (PAGE_SHIFT - 9);
-			nr_blocks -= 1 << (PAGE_SHIFT - 9);
-			if (!nr_blocks)
-				continue;
-		}
+	list_for_each_entry(se, &si->first_swap_extent.list, list) {
+		start_block = se->start_block << (PAGE_SHIFT - 9);
+		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
 		err = blkdev_issue_discard(si->bdev, start_block,
-						nr_blocks, GFP_KERNEL,
-						DISCARD_FL_BARRIER);
+				nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
 		if (err)
 			break;
 
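The "<< (PAGE_SHIFT - 9)" shifts above convert page numbers into the 512-byte sectors that blkdev_issue_discard() expects; skipping the header means starting one page (eight sectors, on 4K pages) further in. A standalone sketch of that arithmetic, assuming 4K pages:

	#include <assert.h>

	#define PAGE_SHIFT 12			/* assumes 4K pages */

	int main(void)
	{
		unsigned long long start_page = 0, nr_pages = 16;

		/* skip the swap header page, then convert to sectors */
		unsigned long long start_block =
			(start_page + 1) << (PAGE_SHIFT - 9);
		unsigned long long nr_blocks =
			(nr_pages - 1) << (PAGE_SHIFT - 9);

		assert(start_block == 8);	/* sector 8 = byte 4096 */
		assert(nr_blocks == 120);	/* 15 pages * 8 sectors */
		return 0;
	}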
@@ -201,14 +189,11 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 		start_block <<= PAGE_SHIFT - 9;
 		nr_blocks <<= PAGE_SHIFT - 9;
 		if (blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_NOIO,
-				DISCARD_FL_BARRIER))
+				nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
 			break;
 		}
 
 		lh = se->list.next;
-		if (lh == &si->extent_list)
-			lh = lh->next;
 		se = list_entry(lh, struct swap_extent, list);
 	}
 }
@@ -223,7 +208,7 @@ static int wait_for_discard(void *word)
 #define LATENCY_LIMIT		256
 
 static inline unsigned long scan_swap_map(struct swap_info_struct *si,
-					  int cache)
+					  unsigned char usage)
 {
 	unsigned long offset;
 	unsigned long scan_base;
@@ -354,10 +339,7 @@ checks:
 		si->lowest_bit = si->max;
 		si->highest_bit = 0;
 	}
-	if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
-		si->swap_map[offset] = encode_swapmap(0, true);
-	else /* at suspend */
-		si->swap_map[offset] = encode_swapmap(1, false);
+	si->swap_map[offset] = usage;
 	si->cluster_next = offset + 1;
 	si->flags -= SWP_SCANNING;
 
@@ -467,10 +449,10 @@ swp_entry_t get_swap_page(void)
 	nr_swap_pages--;
 
 	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
-		si = swap_info + type;
+		si = swap_info[type];
 		next = si->next;
 		if (next < 0 ||
-		    (!wrapped && si->prio != swap_info[next].prio)) {
+		    (!wrapped && si->prio != swap_info[next]->prio)) {
 			next = swap_list.head;
 			wrapped++;
 		}
@@ -482,7 +464,7 @@ swp_entry_t get_swap_page(void)
 
 		swap_list.next = next;
 		/* This is called for allocating swap entry for cache */
-		offset = scan_swap_map(si, SWAP_CACHE);
+		offset = scan_swap_map(si, SWAP_HAS_CACHE);
 		if (offset) {
 			spin_unlock(&swap_lock);
 			return swp_entry(type, offset);
@@ -503,11 +485,11 @@ swp_entry_t get_swap_page_of_type(int type)
 	pgoff_t offset;
 
 	spin_lock(&swap_lock);
-	si = swap_info + type;
-	if (si->flags & SWP_WRITEOK) {
+	si = swap_info[type];
+	if (si && (si->flags & SWP_WRITEOK)) {
 		nr_swap_pages--;
 		/* This is called for allocating swap entry, not cache */
-		offset = scan_swap_map(si, SWAP_MAP);
+		offset = scan_swap_map(si, 1);
 		if (offset) {
 			spin_unlock(&swap_lock);
 			return swp_entry(type, offset);
@@ -518,9 +500,9 @@ swp_entry_t get_swap_page_of_type(int type)
 	return (swp_entry_t) {0};
 }
 
-static struct swap_info_struct * swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *swap_info_get(swp_entry_t entry)
 {
-	struct swap_info_struct * p;
+	struct swap_info_struct *p;
 	unsigned long offset, type;
 
 	if (!entry.val)
@@ -528,7 +510,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry)
 	type = swp_type(entry);
 	if (type >= nr_swapfiles)
 		goto bad_nofile;
-	p = & swap_info[type];
+	p = swap_info[type];
 	if (!(p->flags & SWP_USED))
 		goto bad_device;
 	offset = swp_offset(entry);
@@ -554,41 +536,56 @@ out:
 	return NULL;
 }
 
-static int swap_entry_free(struct swap_info_struct *p,
-			   swp_entry_t ent, int cache)
+static unsigned char swap_entry_free(struct swap_info_struct *p,
+				     swp_entry_t entry, unsigned char usage)
 {
-	unsigned long offset = swp_offset(ent);
-	int count = swap_count(p->swap_map[offset]);
-	bool has_cache;
+	unsigned long offset = swp_offset(entry);
+	unsigned char count;
+	unsigned char has_cache;
 
-	has_cache = swap_has_cache(p->swap_map[offset]);
+	count = p->swap_map[offset];
+	has_cache = count & SWAP_HAS_CACHE;
+	count &= ~SWAP_HAS_CACHE;
 
-	if (cache == SWAP_MAP) { /* dropping usage count of swap */
-		if (count < SWAP_MAP_MAX) {
-			count--;
-			p->swap_map[offset] = encode_swapmap(count, has_cache);
-		}
-	} else { /* dropping swap cache flag */
+	if (usage == SWAP_HAS_CACHE) {
 		VM_BUG_ON(!has_cache);
-		p->swap_map[offset] = encode_swapmap(count, false);
-
+		has_cache = 0;
+	} else if (count == SWAP_MAP_SHMEM) {
+		/*
+		 * Or we could insist on shmem.c using a special
+		 * swap_shmem_free() and free_shmem_swap_and_cache()...
+		 */
+		count = 0;
+	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+		if (count == COUNT_CONTINUED) {
+			if (swap_count_continued(p, offset, count))
+				count = SWAP_MAP_MAX | COUNT_CONTINUED;
+			else
+				count = SWAP_MAP_MAX;
+		} else
+			count--;
 	}
-	/* return code. */
-	count = p->swap_map[offset];
+
+	if (!count)
+		mem_cgroup_uncharge_swap(entry);
+
+	usage = count | has_cache;
+	p->swap_map[offset] = usage;
+
 	/* free if no reference */
-	if (!count) {
+	if (!usage) {
 		if (offset < p->lowest_bit)
 			p->lowest_bit = offset;
 		if (offset > p->highest_bit)
 			p->highest_bit = offset;
-		if (p->prio > swap_info[swap_list.next].prio)
-			swap_list.next = p - swap_info;
+		if (swap_list.next >= 0 &&
+		    p->prio > swap_info[swap_list.next]->prio)
+			swap_list.next = p->type;
 		nr_swap_pages++;
 		p->inuse_pages--;
 	}
-	if (!swap_count(count))
-		mem_cgroup_uncharge_swap(ent);
-	return count;
+
+	return usage;
 }
 
 /*
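The rewritten free path above decodes the map byte once, handles the shmem and continuation cases, and writes the recombined value back. The decrement decision in isolation, with the constants assumed as before plus SWAP_MAP_SHMEM (0xbf), and the continuation lookup stubbed out as a hypothetical helper:

	#include <assert.h>

	#define SWAP_HAS_CACHE	0x40
	#define COUNT_CONTINUED	0x80
	#define SWAP_MAP_MAX	0x3e
	#define SWAP_MAP_SHMEM	0xbf	/* assumed: single shmem reference */

	/* hypothetical stand-in for swap_count_continued(): no extra digits */
	static int swap_count_continued_stub(void)
	{
		return 0;
	}

	static unsigned char drop_swap_ref(unsigned char count)
	{
		if (count == SWAP_MAP_SHMEM)
			return 0;		/* shmem's one reference gone */
		if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
			if (count == COUNT_CONTINUED)	/* borrow from page */
				return swap_count_continued_stub() ?
					(SWAP_MAP_MAX | COUNT_CONTINUED) :
					SWAP_MAP_MAX;
			return count - 1;
		}
		return count;	/* e.g. SWAP_MAP_BAD stays untouched */
	}

	int main(void)
	{
		assert(drop_swap_ref(2) == 1);
		assert(drop_swap_ref(SWAP_MAP_SHMEM) == 0);
		assert(drop_swap_ref(COUNT_CONTINUED) == SWAP_MAP_MAX);
		return 0;
	}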
@@ -597,11 +594,11 @@ static int swap_entry_free(struct swap_info_struct *p,
  */
 void swap_free(swp_entry_t entry)
 {
-	struct swap_info_struct * p;
+	struct swap_info_struct *p;
 
 	p = swap_info_get(entry);
 	if (p) {
-		swap_entry_free(p, entry, SWAP_MAP);
+		swap_entry_free(p, entry, 1);
 		spin_unlock(&swap_lock);
 	}
 }
@@ -612,26 +609,21 @@ void swap_free(swp_entry_t entry)
 void swapcache_free(swp_entry_t entry, struct page *page)
 {
 	struct swap_info_struct *p;
-	int ret;
+	unsigned char count;
 
 	p = swap_info_get(entry);
 	if (p) {
-		ret = swap_entry_free(p, entry, SWAP_CACHE);
-		if (page) {
-			bool swapout;
-			if (ret)
-				swapout = true; /* the end of swap out */
-			else
-				swapout = false; /* no more swap users! */
-			mem_cgroup_uncharge_swapcache(page, entry, swapout);
-		}
+		count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
+		if (page)
+			mem_cgroup_uncharge_swapcache(page, entry, count != 0);
 		spin_unlock(&swap_lock);
 	}
-	return;
 }
 
 /*
  * How many references to page are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
  */
 static inline int page_swapcount(struct page *page)
 {
@@ -659,6 +651,8 @@ int reuse_swap_page(struct page *page)
 	int count;
 
 	VM_BUG_ON(!PageLocked(page));
+	if (unlikely(PageKsm(page)))
+		return 0;
 	count = page_mapcount(page);
 	if (count <= 1 && PageSwapCache(page)) {
 		count += page_swapcount(page);
@@ -667,7 +661,7 @@ int reuse_swap_page(struct page *page)
 			SetPageDirty(page);
 		}
 	}
-	return count == 1;
+	return count <= 1;
 }
 
 /*
@@ -704,7 +698,7 @@ int free_swap_and_cache(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
+		if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
 			page = find_get_page(&swapper_space, entry.val);
 			if (page && !trylock_page(page)) {
 				page_cache_release(page);
@@ -729,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry)
 	return p != NULL;
 }
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/**
+ * mem_cgroup_count_swap_user - count the user of a swap entry
+ * @ent: the swap entry to be checked
+ * @pagep: the pointer for the swap cache page of the entry to be stored
+ *
+ * Returns the number of the user of the swap entry. The number is valid only
+ * for swaps of anonymous pages.
+ * If the entry is found on swap cache, the page is stored to pagep with
+ * refcount of it being incremented.
+ */
+int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
+{
+	struct page *page;
+	struct swap_info_struct *p;
+	int count = 0;
+
+	page = find_get_page(&swapper_space, ent.val);
+	if (page)
+		count += page_mapcount(page);
+	p = swap_info_get(ent);
+	if (p) {
+		count += swap_count(p->swap_map[swp_offset(ent)]);
+		spin_unlock(&swap_lock);
+	}
+
+	*pagep = page;
+	return count;
+}
+#endif
+
 #ifdef CONFIG_HIBERNATION
 /*
  * Find the swap type that corresponds to given device (if any).
@@ -741,14 +766,14 @@ int free_swap_and_cache(swp_entry_t entry)
 int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 {
 	struct block_device *bdev = NULL;
-	int i;
+	int type;
 
 	if (device)
 		bdev = bdget(device);
 
 	spin_lock(&swap_lock);
-	for (i = 0; i < nr_swapfiles; i++) {
-		struct swap_info_struct *sis = swap_info + i;
+	for (type = 0; type < nr_swapfiles; type++) {
+		struct swap_info_struct *sis = swap_info[type];
 
 		if (!(sis->flags & SWP_WRITEOK))
 			continue;
@@ -758,20 +783,18 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 				*bdev_p = bdgrab(sis->bdev);
 
 			spin_unlock(&swap_lock);
-			return i;
+			return type;
 		}
 		if (bdev == sis->bdev) {
-			struct swap_extent *se;
+			struct swap_extent *se = &sis->first_swap_extent;
 
-			se = list_entry(sis->extent_list.next,
-					struct swap_extent, list);
 			if (se->start_block == offset) {
 				if (bdev_p)
 					*bdev_p = bdgrab(sis->bdev);
 
 				spin_unlock(&swap_lock);
 				bdput(bdev);
-				return i;
+				return type;
 			}
 		}
 	}
@@ -783,6 +806,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 }
 
 /*
+ * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * corresponding to given index in swap_info (swap type).
+ */
+sector_t swapdev_block(int type, pgoff_t offset)
+{
+	struct block_device *bdev;
+
+	if ((unsigned int)type >= nr_swapfiles)
+		return 0;
+	if (!(swap_info[type]->flags & SWP_WRITEOK))
+		return 0;
+	return map_swap_entry(swp_entry(type, offset), &bdev);
+}
+
+/*
  * Return either the total number of swap pages of given type, or the number
  * of free pages of that type (depending on @free)
  *
@@ -792,18 +830,20 @@ unsigned int count_swap_pages(int type, int free)
 {
 	unsigned int n = 0;
 
-	if (type < nr_swapfiles) {
-		spin_lock(&swap_lock);
-		if (swap_info[type].flags & SWP_WRITEOK) {
-			n = swap_info[type].pages;
+	spin_lock(&swap_lock);
+	if ((unsigned int)type < nr_swapfiles) {
+		struct swap_info_struct *sis = swap_info[type];
+
+		if (sis->flags & SWP_WRITEOK) {
+			n = sis->pages;
 			if (free)
-				n -= swap_info[type].inuse_pages;
+				n -= sis->inuse_pages;
 		}
-		spin_unlock(&swap_lock);
 	}
+	spin_unlock(&swap_lock);
 	return n;
 }
-#endif
+#endif /* CONFIG_HIBERNATION */
 
 /*
  * No need to decide whether this PTE shares the swap entry with others,
@@ -831,7 +871,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 		goto out;
 	}
 
-	inc_mm_counter(vma->vm_mm, anon_rss);
+	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
@@ -932,7 +973,7 @@ static int unuse_vma(struct vm_area_struct *vma,
 	unsigned long addr, end, next;
 	int ret;
 
-	if (page->mapping) {
+	if (page_anon_vma(page)) {
 		addr = page_address_in_vma(page, vma);
 		if (addr == -EFAULT)
 			return 0;
@@ -988,7 +1029,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
 {
 	unsigned int max = si->max;
 	unsigned int i = prev;
-	int count;
+	unsigned char count;
 
 	/*
 	 * No need for swap_lock here: we're just looking
@@ -1024,16 +1065,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
  */
 static int try_to_unuse(unsigned int type)
 {
-	struct swap_info_struct * si = &swap_info[type];
+	struct swap_info_struct *si = swap_info[type];
 	struct mm_struct *start_mm;
-	unsigned short *swap_map;
-	unsigned short swcount;
+	unsigned char *swap_map;
+	unsigned char swcount;
 	struct page *page;
 	swp_entry_t entry;
 	unsigned int i = 0;
 	int retval = 0;
-	int reset_overflow = 0;
-	int shmem;
 
 	/*
 	 * When searching mms for an entry, a good strategy is to
@@ -1047,8 +1086,7 @@ static int try_to_unuse(unsigned int type)
 	 * together, child after parent.  If we race with dup_mmap(), we
 	 * prefer to resolve parent before child, lest we miss entries
 	 * duplicated after we scanned child: using last mm would invert
-	 * that. Though it's only a serious concern when an overflowed
-	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
+	 * that.
 	 */
 	start_mm = &init_mm;
 	atomic_inc(&init_mm.mm_users);
@@ -1110,17 +1148,18 @@ static int try_to_unuse(unsigned int type)
 
 		/*
 		 * Remove all references to entry.
-		 * Whenever we reach init_mm, there's no address space
-		 * to search, but use it as a reminder to search shmem.
 		 */
-		shmem = 0;
 		swcount = *swap_map;
-		if (swap_count(swcount)) {
-			if (start_mm == &init_mm)
-				shmem = shmem_unuse(entry, page);
-			else
-				retval = unuse_mm(start_mm, entry, page);
+		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
+			retval = shmem_unuse(entry, page);
+			/* page has already been unlocked and released */
+			if (retval < 0)
+				break;
+			continue;
 		}
+		if (swap_count(swcount) && start_mm != &init_mm)
+			retval = unuse_mm(start_mm, entry, page);
+
 		if (swap_count(*swap_map)) {
 			int set_start_mm = (*swap_map >= swcount);
 			struct list_head *p = &start_mm->mmlist;
@@ -1131,7 +1170,7 @@ static int try_to_unuse(unsigned int type)
 			atomic_inc(&new_start_mm->mm_users);
 			atomic_inc(&prev_mm->mm_users);
 			spin_lock(&mmlist_lock);
-			while (swap_count(*swap_map) && !retval && !shmem &&
+			while (swap_count(*swap_map) && !retval &&
 					(p = p->next) != &start_mm->mmlist) {
 				mm = list_entry(p, struct mm_struct, mmlist);
 				if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1145,10 +1184,9 @@ static int try_to_unuse(unsigned int type)
 				swcount = *swap_map;
 				if (!swap_count(swcount)) /* any usage ? */
 					;
-				else if (mm == &init_mm) {
+				else if (mm == &init_mm)
 					set_start_mm = 1;
-					shmem = shmem_unuse(entry, page);
-				} else
+				else
 					retval = unuse_mm(mm, entry, page);
 
 				if (set_start_mm && *swap_map < swcount) {
@@ -1164,13 +1202,6 @@ static int try_to_unuse(unsigned int type)
 			mmput(start_mm);
 			start_mm = new_start_mm;
 		}
-		if (shmem) {
-			/* page has already been unlocked and released */
-			if (shmem > 0)
-				continue;
-			retval = shmem;
-			break;
-		}
 		if (retval) {
 			unlock_page(page);
 			page_cache_release(page);
@@ -1178,30 +1209,6 @@ static int try_to_unuse(unsigned int type)
 		}
 
 		/*
-		 * How could swap count reach 0x7ffe ?
-		 * There's no way to repeat a swap page within an mm
-		 * (except in shmem, where it's the shared object which takes
-		 * the reference count)?
-		 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
-		 * short is too small....)
-		 * If that's wrong, then we should worry more about
-		 * exit_mmap() and do_munmap() cases described above:
-		 * we might be resetting SWAP_MAP_MAX too early here.
-		 * We know "Undead"s can happen, they're okay, so don't
-		 * report them; but do report if we reset SWAP_MAP_MAX.
-		 */
-		/* We might release the lock_page() in unuse_mm(). */
-		if (!PageSwapCache(page) || page_private(page) != entry.val)
-			goto retry;
-
-		if (swap_count(*swap_map) == SWAP_MAP_MAX) {
-			spin_lock(&swap_lock);
-			*swap_map = encode_swapmap(0, true);
-			spin_unlock(&swap_lock);
-			reset_overflow = 1;
-		}
-
-		/*
 		 * If a reference remains (rare), we would like to leave
 		 * the page in the swap cache; but try_to_unmap could
 		 * then re-duplicate the entry once we drop page lock,
@@ -1213,6 +1220,12 @@ static int try_to_unuse(unsigned int type)
 		 * read from disk into another page. Splitting into two
 		 * pages would be incorrect if swap supported "shared
 		 * private" pages, but they are handled by tmpfs files.
+		 *
+		 * Given how unuse_vma() targets one particular offset
+		 * in an anon_vma, once the anon_vma has been determined,
+		 * this splitting happens to be just what is needed to
+		 * handle where KSM pages have been swapped out: re-reading
+		 * is unnecessarily slow, but we can fix that later on.
 		 */
 		if (swap_count(*swap_map) &&
 		    PageDirty(page) && PageSwapCache(page)) {
@@ -1242,7 +1255,6 @@ static int try_to_unuse(unsigned int type)
 		 * mark page dirty so shrink_page_list will preserve it.
 		 */
 		SetPageDirty(page);
-retry:
 		unlock_page(page);
 		page_cache_release(page);
 
@@ -1254,10 +1266,6 @@ retry:
 	}
 
 	mmput(start_mm);
-	if (reset_overflow) {
-		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
-		swap_overflow = 0;
-	}
 	return retval;
 }
 
@@ -1270,10 +1278,10 @@ retry:
 static void drain_mmlist(void)
 {
 	struct list_head *p, *next;
-	unsigned int i;
+	unsigned int type;
 
-	for (i = 0; i < nr_swapfiles; i++)
-		if (swap_info[i].inuse_pages)
+	for (type = 0; type < nr_swapfiles; type++)
+		if (swap_info[type]->inuse_pages)
 			return;
 	spin_lock(&mmlist_lock);
 	list_for_each_safe(p, next, &init_mm.mmlist)
@@ -1283,12 +1291,23 @@ static void drain_mmlist(void)
 
 /*
  * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
- * corresponds to page offset `offset'.
+ * corresponds to page offset for the specified swap entry.
+ * Note that the type of this function is sector_t, but it returns page offset
+ * into the bdev, not sector offset.
  */
-sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
+static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 {
-	struct swap_extent *se = sis->curr_swap_extent;
-	struct swap_extent *start_se = se;
+	struct swap_info_struct *sis;
+	struct swap_extent *start_se;
+	struct swap_extent *se;
+	pgoff_t offset;
+
+	sis = swap_info[swp_type(entry)];
+	*bdev = sis->bdev;
+
+	offset = swp_offset(entry);
+	start_se = sis->curr_swap_extent;
+	se = start_se;
 
 	for ( ; ; ) {
 		struct list_head *lh;
@@ -1298,40 +1317,31 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
 			return se->start_block + (offset - se->start_page);
 		}
 		lh = se->list.next;
-		if (lh == &sis->extent_list)
-			lh = lh->next;
 		se = list_entry(lh, struct swap_extent, list);
 		sis->curr_swap_extent = se;
 		BUG_ON(se == start_se);		/* It *must* be present */
 	}
 }
 
-#ifdef CONFIG_HIBERNATION
 /*
- * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
- * corresponding to given index in swap_info (swap type).
+ * Returns the page offset into bdev for the specified page's swap entry.
  */
-sector_t swapdev_block(int swap_type, pgoff_t offset)
+sector_t map_swap_page(struct page *page, struct block_device **bdev)
 {
-	struct swap_info_struct *sis;
-
-	if (swap_type >= nr_swapfiles)
-		return 0;
-
-	sis = swap_info + swap_type;
-	return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
+	swp_entry_t entry;
+	entry.val = page_private(page);
+	return map_swap_entry(entry, bdev);
 }
-#endif /* CONFIG_HIBERNATION */
 
 /*
  * Free all of a swapdev's extent information
  */
 static void destroy_swap_extents(struct swap_info_struct *sis)
 {
-	while (!list_empty(&sis->extent_list)) {
+	while (!list_empty(&sis->first_swap_extent.list)) {
 		struct swap_extent *se;
 
-		se = list_entry(sis->extent_list.next,
+		se = list_entry(sis->first_swap_extent.list.next,
 				struct swap_extent, list);
 		list_del(&se->list);
 		kfree(se);
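For reference, the extent walk that map_swap_entry() performs is simple interval arithmetic: each extent maps a contiguous run of swap offsets onto a contiguous run of disk blocks, and the block is start_block + (offset - start_page). A toy version over a plain array (the kernel keeps a circular list plus a curr_swap_extent cursor so repeated lookups start near the last hit):

	#include <stdio.h>

	struct extent { unsigned long start_page, nr_pages, start_block; };

	static unsigned long lookup_block(const struct extent *tbl, int n,
					  unsigned long offset)
	{
		for (int i = 0; i < n; i++) {
			const struct extent *se = &tbl[i];
			if (offset >= se->start_page &&
			    offset < se->start_page + se->nr_pages)
				return se->start_block +
					(offset - se->start_page);
		}
		return 0;	/* not reached if extents cover the area */
	}

	int main(void)
	{
		struct extent tbl[] = {
			{ 0,  64, 1000 },	/* offsets 0..63  -> 1000.. */
			{ 64, 32, 5000 },	/* offsets 64..95 -> 5000.. */
		};
		printf("%lu\n", lookup_block(tbl, 2, 70));	/* prints 5006 */
		return 0;
	}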
@@ -1352,8 +1362,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 	struct swap_extent *new_se;
 	struct list_head *lh;
 
-	lh = sis->extent_list.prev;	/* The highest page extent */
-	if (lh != &sis->extent_list) {
+	if (start_page == 0) {
+		se = &sis->first_swap_extent;
+		sis->curr_swap_extent = se;
+		se->start_page = 0;
+		se->nr_pages = nr_pages;
+		se->start_block = start_block;
+		return 1;
+	} else {
+		lh = sis->first_swap_extent.list.prev;	/* Highest extent */
 		se = list_entry(lh, struct swap_extent, list);
 		BUG_ON(se->start_page + se->nr_pages != start_page);
 		if (se->start_block + se->nr_pages == start_block) {
@@ -1373,7 +1390,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 	new_se->nr_pages = nr_pages;
 	new_se->start_block = start_block;
 
-	list_add_tail(&new_se->list, &sis->extent_list);
+	list_add_tail(&new_se->list, &sis->first_swap_extent.list);
 	return 1;
 }
 
@@ -1425,7 +1442,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 	if (S_ISBLK(inode->i_mode)) {
 		ret = add_swap_extent(sis, 0, sis->max, 0);
 		*span = sis->pages;
-		goto done;
+		goto out;
 	}
 
 	blkbits = inode->i_blkbits;
@@ -1496,25 +1513,22 @@ reprobe:
 	sis->max = page_no;
 	sis->pages = page_no - 1;
 	sis->highest_bit = page_no - 1;
-done:
-	sis->curr_swap_extent = list_entry(sis->extent_list.prev,
-					struct swap_extent, list);
-	goto out;
+out:
+	return ret;
 bad_bmap:
 	printk(KERN_ERR "swapon: swapfile has holes\n");
 	ret = -EINVAL;
-out:
-	return ret;
+	goto out;
 }
 
 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
-	struct swap_info_struct * p = NULL;
-	unsigned short *swap_map;
+	struct swap_info_struct *p = NULL;
+	unsigned char *swap_map;
 	struct file *swap_file, *victim;
 	struct address_space *mapping;
 	struct inode *inode;
-	char * pathname;
+	char *pathname;
 	int i, type, prev;
 	int err;
 
@@ -1535,8 +1549,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	mapping = victim->f_mapping;
 	prev = -1;
 	spin_lock(&swap_lock);
-	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
-		p = swap_info + type;
+	for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
+		p = swap_info[type];
 		if (p->flags & SWP_WRITEOK) {
 			if (p->swap_file->f_mapping == mapping)
 				break;
@@ -1555,18 +1569,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 		spin_unlock(&swap_lock);
 		goto out_dput;
 	}
-	if (prev < 0) {
+	if (prev < 0)
 		swap_list.head = p->next;
-	} else {
-		swap_info[prev].next = p->next;
-	}
+	else
+		swap_info[prev]->next = p->next;
 	if (type == swap_list.next) {
 		/* just pick something that's safe... */
 		swap_list.next = swap_list.head;
 	}
 	if (p->prio < 0) {
-		for (i = p->next; i >= 0; i = swap_info[i].next)
-			swap_info[i].prio = p->prio--;
+		for (i = p->next; i >= 0; i = swap_info[i]->next)
+			swap_info[i]->prio = p->prio--;
 		least_priority++;
 	}
 	nr_swap_pages -= p->pages;
@@ -1584,16 +1597,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 		if (p->prio < 0)
 			p->prio = --least_priority;
 		prev = -1;
-		for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
-			if (p->prio >= swap_info[i].prio)
+		for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+			if (p->prio >= swap_info[i]->prio)
 				break;
 			prev = i;
 		}
 		p->next = i;
 		if (prev < 0)
-			swap_list.head = swap_list.next = p - swap_info;
+			swap_list.head = swap_list.next = type;
 		else
-			swap_info[prev].next = p - swap_info;
+			swap_info[prev]->next = type;
 		nr_swap_pages += p->pages;
 		total_swap_pages += p->pages;
 		p->flags |= SWP_WRITEOK;
@@ -1606,6 +1619,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	up_write(&swap_unplug_sem);
 
 	destroy_swap_extents(p);
+	if (p->flags & SWP_CONTINUED)
+		free_swap_count_continuations(p);
+
 	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);
 	drain_mmlist();
@@ -1653,8 +1669,8 @@ out:
 /* iterator */
 static void *swap_start(struct seq_file *swap, loff_t *pos)
 {
-	struct swap_info_struct *ptr = swap_info;
-	int i;
+	struct swap_info_struct *si;
+	int type;
 	loff_t l = *pos;
 
 	mutex_lock(&swapon_mutex);
@@ -1662,11 +1678,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
 	if (!l)
 		return SEQ_START_TOKEN;
 
-	for (i = 0; i < nr_swapfiles; i++, ptr++) {
-		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
+	for (type = 0; type < nr_swapfiles; type++) {
+		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
+		si = swap_info[type];
+		if (!(si->flags & SWP_USED) || !si->swap_map)
 			continue;
 		if (!--l)
-			return ptr;
+			return si;
 	}
 
 	return NULL;
@@ -1674,21 +1692,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
1674 1692
1675static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 1693static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1676{ 1694{
1677 struct swap_info_struct *ptr; 1695 struct swap_info_struct *si = v;
1678 struct swap_info_struct *endptr = swap_info + nr_swapfiles; 1696 int type;
1679 1697
1680 if (v == SEQ_START_TOKEN) 1698 if (v == SEQ_START_TOKEN)
1681 ptr = swap_info; 1699 type = 0;
1682 else { 1700 else
1683 ptr = v; 1701 type = si->type + 1;
1684 ptr++;
1685 }
1686 1702
1687 for (; ptr < endptr; ptr++) { 1703 for (; type < nr_swapfiles; type++) {
1688 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1704 smp_rmb(); /* read nr_swapfiles before swap_info[type] */
1705 si = swap_info[type];
1706 if (!(si->flags & SWP_USED) || !si->swap_map)
1689 continue; 1707 continue;
1690 ++*pos; 1708 ++*pos;
1691 return ptr; 1709 return si;
1692 } 1710 }
1693 1711
1694 return NULL; 1712 return NULL;
@@ -1701,24 +1719,24 @@ static void swap_stop(struct seq_file *swap, void *v)
1701 1719
1702static int swap_show(struct seq_file *swap, void *v) 1720static int swap_show(struct seq_file *swap, void *v)
1703{ 1721{
1704 struct swap_info_struct *ptr = v; 1722 struct swap_info_struct *si = v;
1705 struct file *file; 1723 struct file *file;
1706 int len; 1724 int len;
1707 1725
1708 if (ptr == SEQ_START_TOKEN) { 1726 if (si == SEQ_START_TOKEN) {
1709 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 1727 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1710 return 0; 1728 return 0;
1711 } 1729 }
1712 1730
1713 file = ptr->swap_file; 1731 file = si->swap_file;
1714 len = seq_path(swap, &file->f_path, " \t\n\\"); 1732 len = seq_path(swap, &file->f_path, " \t\n\\");
1715 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1733 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1716 len < 40 ? 40 - len : 1, " ", 1734 len < 40 ? 40 - len : 1, " ",
1717 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 1735 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1718 "partition" : "file\t", 1736 "partition" : "file\t",
1719 ptr->pages << (PAGE_SHIFT - 10), 1737 si->pages << (PAGE_SHIFT - 10),
1720 ptr->inuse_pages << (PAGE_SHIFT - 10), 1738 si->inuse_pages << (PAGE_SHIFT - 10),
1721 ptr->prio); 1739 si->prio);
1722 return 0; 1740 return 0;
1723} 1741}
1724 1742
@@ -1765,7 +1783,7 @@ late_initcall(max_swapfiles_check);
  */
 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 {
-	struct swap_info_struct * p;
+	struct swap_info_struct *p;
 	char *name = NULL;
 	struct block_device *bdev = NULL;
 	struct file *swap_file = NULL;
@@ -1773,36 +1791,58 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	unsigned int type;
 	int i, prev;
 	int error;
-	union swap_header *swap_header = NULL;
-	unsigned int nr_good_pages = 0;
+	union swap_header *swap_header;
+	unsigned int nr_good_pages;
 	int nr_extents = 0;
 	sector_t span;
-	unsigned long maxpages = 1;
+	unsigned long maxpages;
 	unsigned long swapfilepages;
-	unsigned short *swap_map = NULL;
+	unsigned char *swap_map = NULL;
 	struct page *page = NULL;
 	struct inode *inode = NULL;
 	int did_down = 0;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
 	spin_lock(&swap_lock);
-	p = swap_info;
-	for (type = 0 ; type < nr_swapfiles ; type++,p++)
-		if (!(p->flags & SWP_USED))
+	for (type = 0; type < nr_swapfiles; type++) {
+		if (!(swap_info[type]->flags & SWP_USED))
 			break;
+	}
 	error = -EPERM;
 	if (type >= MAX_SWAPFILES) {
 		spin_unlock(&swap_lock);
+		kfree(p);
 		goto out;
 	}
-	if (type >= nr_swapfiles)
-		nr_swapfiles = type+1;
-	memset(p, 0, sizeof(*p));
-	INIT_LIST_HEAD(&p->extent_list);
+	if (type >= nr_swapfiles) {
+		p->type = type;
+		swap_info[type] = p;
+		/*
+		 * Write swap_info[type] before nr_swapfiles, in case a
+		 * racing procfs swap_start() or swap_next() is reading them.
+		 * (We never shrink nr_swapfiles, we never free this entry.)
+		 */
+		smp_wmb();
+		nr_swapfiles++;
+	} else {
+		kfree(p);
+		p = swap_info[type];
+		/*
+		 * Do not memset this entry: a racing procfs swap_next()
+		 * would be relying on p->type to remain valid.
+		 */
+	}
+	INIT_LIST_HEAD(&p->first_swap_extent.list);
 	p->flags = SWP_USED;
 	p->next = -1;
 	spin_unlock(&swap_lock);
+
 	name = getname(specialfile);
 	error = PTR_ERR(name);
 	if (IS_ERR(name)) {
@@ -1822,7 +1862,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	error = -EBUSY;
 	for (i = 0; i < nr_swapfiles; i++) {
-		struct swap_info_struct *q = &swap_info[i];
+		struct swap_info_struct *q = swap_info[i];
 
 		if (i == type || !q->swap_file)
 			continue;
@@ -1897,6 +1937,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	p->lowest_bit = 1;
 	p->cluster_next = 1;
+	p->cluster_nr = 0;
 
 	/*
 	 * Find out how many pages are allowed for a single swap
@@ -1913,9 +1954,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	 * swap pte.
 	 */
 	maxpages = swp_offset(pte_to_swp_entry(
-			swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1;
-	if (maxpages > swap_header->info.last_page)
-		maxpages = swap_header->info.last_page;
+			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+	if (maxpages > swap_header->info.last_page) {
+		maxpages = swap_header->info.last_page + 1;
+		/* p->max is an unsigned int: don't overflow it */
+		if ((unsigned int)maxpages == 0)
+			maxpages = UINT_MAX;
+	}
 	p->highest_bit = maxpages - 1;
 
 	error = -EINVAL;
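The maxpages change above fixes an off-by-one: the largest offset that survives a round-trip through a swap pte is the highest usable page index, so the page count is that offset plus one; likewise last_page is an index, so the clamp becomes last_page + 1. A toy round-trip with an assumed 5-bit offset field (the real width is per-architecture):

	#include <assert.h>

	#define OFFSET_BITS 5	/* assumed toy width */

	static unsigned long pte_roundtrip(unsigned long offset)
	{
		return offset & ((1UL << OFFSET_BITS) - 1);	/* truncate */
	}

	int main(void)
	{
		unsigned long last_page = 20;	/* from the swap header */
		unsigned long maxpages = pte_roundtrip(~0UL) + 1;

		assert(maxpages == 32);		/* offsets 0..31 encodable */
		if (maxpages > last_page)
			maxpages = last_page + 1;	/* pages 0..last_page */
		assert(maxpages == 21);
		return 0;
	}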
@@ -1932,30 +1977,31 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		goto bad_swap;
 
 	/* OK, set up the swap map and apply the bad block list */
-	swap_map = vmalloc(maxpages * sizeof(short));
+	swap_map = vmalloc(maxpages);
 	if (!swap_map) {
 		error = -ENOMEM;
 		goto bad_swap;
 	}
 
-	memset(swap_map, 0, maxpages * sizeof(short));
+	memset(swap_map, 0, maxpages);
+	nr_good_pages = maxpages - 1;	/* omit header page */
+
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
-		int page_nr = swap_header->info.badpages[i];
-		if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
+		unsigned int page_nr = swap_header->info.badpages[i];
+		if (page_nr == 0 || page_nr > swap_header->info.last_page) {
 			error = -EINVAL;
 			goto bad_swap;
 		}
-		swap_map[page_nr] = SWAP_MAP_BAD;
+		if (page_nr < maxpages) {
+			swap_map[page_nr] = SWAP_MAP_BAD;
+			nr_good_pages--;
+		}
 	}
 
 	error = swap_cgroup_swapon(type, maxpages);
 	if (error)
 		goto bad_swap;
 
-	nr_good_pages = swap_header->info.last_page -
-			swap_header->info.nr_badpages -
-			1 /* header page */;
-
 	if (nr_good_pages) {
 		swap_map[0] = SWAP_MAP_BAD;
 		p->max = maxpages;
@@ -2003,18 +2049,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	/* insert swap space into swap_list: */
 	prev = -1;
-	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
-		if (p->prio >= swap_info[i].prio) {
+	for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+		if (p->prio >= swap_info[i]->prio)
 			break;
-		}
 		prev = i;
 	}
 	p->next = i;
-	if (prev < 0) {
-		swap_list.head = swap_list.next = p - swap_info;
-	} else {
-		swap_info[prev].next = p - swap_info;
-	}
+	if (prev < 0)
+		swap_list.head = swap_list.next = type;
+	else
+		swap_info[prev]->next = type;
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	error = 0;
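The insertion above keeps swap areas on a singly linked list, threaded through ->next indices in descending priority order, so get_swap_page() can walk highest-priority-first. The same walk against a toy array (made-up priorities):

	#include <stdio.h>

	struct area { int prio; int next; };

	int main(void)
	{
		struct area a[4] = { {10, 2}, {-1, -1}, {5, 1}, {7, 0} };
		int head = 0;		/* list: prio 10 -> 5 -> -1 */
		int type = 3;		/* insert a[3], prio 7 */
		int i, prev = -1;

		for (i = head; i >= 0; i = a[i].next) {
			if (a[type].prio >= a[i].prio)
				break;
			prev = i;
		}
		a[type].next = i;
		if (prev < 0)
			head = type;
		else
			a[prev].next = type;

		for (i = head; i >= 0; i = a[i].next)
			printf("%d ", a[i].prio);	/* prints: 10 7 5 -1 */
		return 0;
	}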
@@ -2051,15 +2095,15 @@ out:
 
 void si_swapinfo(struct sysinfo *val)
 {
-	unsigned int i;
+	unsigned int type;
 	unsigned long nr_to_be_unused = 0;
 
 	spin_lock(&swap_lock);
-	for (i = 0; i < nr_swapfiles; i++) {
-		if (!(swap_info[i].flags & SWP_USED) ||
-		    (swap_info[i].flags & SWP_WRITEOK))
-			continue;
-		nr_to_be_unused += swap_info[i].inuse_pages;
+	for (type = 0; type < nr_swapfiles; type++) {
+		struct swap_info_struct *si = swap_info[type];
+
+		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
+			nr_to_be_unused += si->inuse_pages;
 	}
 	val->freeswap = nr_swap_pages + nr_to_be_unused;
 	val->totalswap = total_swap_pages + nr_to_be_unused;
@@ -2069,101 +2113,111 @@ void si_swapinfo(struct sysinfo *val)
 /*
  * Verify that a swap entry is valid and increment its swap map count.
  *
- * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
- * "permanent", but will be reclaimed by the next swapoff.
  * Returns error code in following case.
  * - success -> 0
  * - swp_entry is invalid -> EINVAL
  * - swp_entry is migration entry -> EINVAL
  * - swap-cache reference is requested but there is already one. -> EEXIST
  * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
  */
-static int __swap_duplicate(swp_entry_t entry, bool cache)
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 {
-	struct swap_info_struct * p;
+	struct swap_info_struct *p;
 	unsigned long offset, type;
-	int result = -EINVAL;
-	int count;
-	bool has_cache;
+	unsigned char count;
+	unsigned char has_cache;
+	int err = -EINVAL;
 
 	if (non_swap_entry(entry))
-		return -EINVAL;
+		goto out;
 
 	type = swp_type(entry);
 	if (type >= nr_swapfiles)
 		goto bad_file;
-	p = type + swap_info;
+	p = swap_info[type];
 	offset = swp_offset(entry);
 
 	spin_lock(&swap_lock);
-
 	if (unlikely(offset >= p->max))
 		goto unlock_out;
 
-	count = swap_count(p->swap_map[offset]);
-	has_cache = swap_has_cache(p->swap_map[offset]);
+	count = p->swap_map[offset];
+	has_cache = count & SWAP_HAS_CACHE;
+	count &= ~SWAP_HAS_CACHE;
+	err = 0;
 
-	if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
+	if (usage == SWAP_HAS_CACHE) {
 
 		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
-		if (!has_cache && count) {
-			p->swap_map[offset] = encode_swapmap(count, true);
-			result = 0;
-		} else if (has_cache) /* someone added cache */
-			result = -EEXIST;
-		else if (!count) /* no users */
-			result = -ENOENT;
+		if (!has_cache && count)
+			has_cache = SWAP_HAS_CACHE;
+		else if (has_cache)		/* someone else added cache */
+			err = -EEXIST;
+		else				/* no users remaining */
+			err = -ENOENT;
 
 	} else if (count || has_cache) {
-		if (count < SWAP_MAP_MAX - 1) {
-			p->swap_map[offset] = encode_swapmap(count + 1,
-							     has_cache);
-			result = 0;
-		} else if (count <= SWAP_MAP_MAX) {
-			if (swap_overflow++ < 5)
-				printk(KERN_WARNING
-				       "swap_dup: swap entry overflow\n");
-			p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
-							      has_cache);
-			result = 0;
-		}
+
+		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+			count += usage;
+		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
+			err = -EINVAL;
+		else if (swap_count_continued(p, offset, count))
+			count = COUNT_CONTINUED;
+		else
+			err = -ENOMEM;
 	} else
-		result = -ENOENT; /* unused swap entry */
+		err = -ENOENT;			/* unused swap entry */
+
+	p->swap_map[offset] = count | has_cache;
+
 unlock_out:
 	spin_unlock(&swap_lock);
 out:
-	return result;
+	return err;
 
 bad_file:
 	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
 	goto out;
 }
+
+/*
+ * Help swapoff by noting that swap entry belongs to shmem/tmpfs
+ * (in which case its reference count is never incremented).
+ */
+void swap_shmem_alloc(swp_entry_t entry)
+{
+	__swap_duplicate(entry, SWAP_MAP_SHMEM);
+}
+
 /*
- * increase reference count of swap entry by 1.
+ * Increase reference count of swap entry by 1.
+ * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
+ * but could not be atomically allocated. Returns 0, just as if it succeeded,
+ * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
+ * might occur if a page table entry has got corrupted.
  */
-void swap_duplicate(swp_entry_t entry)
+int swap_duplicate(swp_entry_t entry)
 {
-	__swap_duplicate(entry, SWAP_MAP);
+	int err = 0;
+
+	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+		err = add_swap_count_continuation(entry, GFP_ATOMIC);
+	return err;
 }
 
 /*
  * @entry: swap entry for which we allocate swap cache.
  *
- * Called when allocating swap cache for exising swap entry,
+ * Called when allocating swap cache for existing swap entry,
  * This can return error codes. Returns 0 at success.
  * -EBUSY means there is a swap cache.
  * Note: return code is different from swap_duplicate().
  */
 int swapcache_prepare(swp_entry_t entry)
 {
-	return __swap_duplicate(entry, SWAP_CACHE);
-}
-
-
-struct swap_info_struct *
-get_swap_info_struct(unsigned type)
-{
-	return &swap_info[type];
+	return __swap_duplicate(entry, SWAP_HAS_CACHE);
 }
 
 /*
@@ -2181,7 +2235,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 	if (!our_page_cluster)	/* no readahead */
 		return 0;
 
-	si = &swap_info[swp_type(entry)];
+	si = swap_info[swp_type(entry)];
 	target = swp_offset(entry);
 	base = (target >> our_page_cluster) << our_page_cluster;
 	end = base + (1 << our_page_cluster);
@@ -2217,3 +2271,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2217 *offset = ++toff; 2271 *offset = ++toff;
2218 return nr_pages? ++nr_pages: 0; 2272 return nr_pages? ++nr_pages: 0;
2219} 2273}
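
valid_swaphandles() rounds the faulting offset down to a page_cluster-aligned base and reads ahead to the end of that cluster. A standalone sketch with numbers plugged in, assuming page_cluster = 3 (the usual default, 8-page clusters) and mirroring the function's guard that offset 0, the swap header, is never read:

#include <stdio.h>

int main(void)
{
        unsigned long target = 42;      /* faulting swap offset */
        int page_cluster = 3;           /* 1 << 3 = 8-page readahead cluster */
        unsigned long base, end;

        base = (target >> page_cluster) << page_cluster;
        end = base + (1 << page_cluster);
        if (!base)
                base++;                 /* offset 0 holds the swap header */

        printf("readahead window [%lu, %lu)\n", base, end);     /* [40, 48) */
        return 0;
}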
2274
2275/*
2276 * add_swap_count_continuation - called when a swap count is duplicated
2277 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
2278 * page of the original vmalloc'ed swap_map, to hold the continuation count
2279 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
2280 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
2281 *
2282 * These continuation pages are seldom referenced: the common paths all work
2283 * on the original swap_map, only referring to a continuation page when the
2284 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
2285 *
2286 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
2287 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
2288 * can be called after dropping locks.
2289 */
2290int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2291{
2292 struct swap_info_struct *si;
2293 struct page *head;
2294 struct page *page;
2295 struct page *list_page;
2296 pgoff_t offset;
2297 unsigned char count;
2298
2299 /*
2300 * When debugging, it's easier to use __GFP_ZERO here; but it's better
2301 * for latency not to zero a page while GFP_ATOMIC and holding locks.
2302 */
2303 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
2304
2305 si = swap_info_get(entry);
2306 if (!si) {
2307 /*
2308 * An acceptable race has occurred since the failing
2309 * __swap_duplicate(): the swap entry has been freed,
2310 * perhaps even the whole swap_map cleared for swapoff.
2311 */
2312 goto outer;
2313 }
2314
2315 offset = swp_offset(entry);
2316 count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
2317
2318 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
2319 /*
2320 * The higher the swap count, the more likely it is that tasks
2321 * will race to add swap count continuation: we need to avoid
2322 * over-provisioning.
2323 */
2324 goto out;
2325 }
2326
2327 if (!page) {
2328 spin_unlock(&swap_lock);
2329 return -ENOMEM;
2330 }
2331
2332 /*
2333 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
2334 * no architecture is using highmem pages for kernel pagetables: so it
2335 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
2336 */
2337 head = vmalloc_to_page(si->swap_map + offset);
2338 offset &= ~PAGE_MASK;
2339
2340 /*
2341 * Page allocation does not initialize the page's lru field,
2342 * but it does always reset its private field.
2343 */
2344 if (!page_private(head)) {
2345 BUG_ON(count & COUNT_CONTINUED);
2346 INIT_LIST_HEAD(&head->lru);
2347 set_page_private(head, SWP_CONTINUED);
2348 si->flags |= SWP_CONTINUED;
2349 }
2350
2351 list_for_each_entry(list_page, &head->lru, lru) {
2352 unsigned char *map;
2353
2354 /*
2355 * If the previous map said no continuation, but we've found
2356 * a continuation page, free our allocation and use this one.
2357 */
2358 if (!(count & COUNT_CONTINUED))
2359 goto out;
2360
2361 map = kmap_atomic(list_page, KM_USER0) + offset;
2362 count = *map;
2363 kunmap_atomic(map, KM_USER0);
2364
2365 /*
2366 * If this continuation count now has some space in it,
2367 * free our allocation and use this one.
2368 */
2369 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2370 goto out;
2371 }
2372
2373 list_add_tail(&page->lru, &head->lru);
2374 page = NULL; /* now it's attached, don't free it */
2375out:
2376 spin_unlock(&swap_lock);
2377outer:
2378 if (page)
2379 __free_page(page);
2380 return 0;
2381}
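
add_swap_count_continuation() leans on the swap_map being vmalloc'ed one byte per entry: vmalloc_to_page() finds the swap_map page covering the entry, and each continuation page chained on head->lru supplies one extra count byte at the same in-page offset. A standalone sketch of that index split, assuming a 4096-byte PAGE_SIZE:

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define PAGE_MASK       (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long offset = 70000;   /* arbitrary swap offset */

        /* which swap_map page covers it, and which byte within that page */
        printf("map page %lu, in-page byte %lu\n",
               offset / PAGE_SIZE, offset & ~PAGE_MASK);       /* 17, 368 */
        return 0;
}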
2382
2383/*
2384 * swap_count_continued - when the original swap_map count is incremented
2385 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
2386 * into, carry if so, or else fail until a new continuation page is allocated;
2387 * when the original swap_map count is decremented from 0 with continuation,
2388 * borrow from the continuation and report whether it still holds more.
2389 * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
2390 */
2391static bool swap_count_continued(struct swap_info_struct *si,
2392 pgoff_t offset, unsigned char count)
2393{
2394 struct page *head;
2395 struct page *page;
2396 unsigned char *map;
2397
2398 head = vmalloc_to_page(si->swap_map + offset);
2399 if (page_private(head) != SWP_CONTINUED) {
2400 BUG_ON(count & COUNT_CONTINUED);
2401 return false; /* need to add count continuation */
2402 }
2403
2404 offset &= ~PAGE_MASK;
2405 page = list_entry(head->lru.next, struct page, lru);
2406 map = kmap_atomic(page, KM_USER0) + offset;
2407
2408 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
2409 goto init_map; /* jump over SWAP_CONT_MAX checks */
2410
2411 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
2412 /*
2413 * Think of how you add 1 to 999
2414 */
2415 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2416 kunmap_atomic(map, KM_USER0);
2417 page = list_entry(page->lru.next, struct page, lru);
2418 BUG_ON(page == head);
2419 map = kmap_atomic(page, KM_USER0) + offset;
2420 }
2421 if (*map == SWAP_CONT_MAX) {
2422 kunmap_atomic(map, KM_USER0);
2423 page = list_entry(page->lru.next, struct page, lru);
2424 if (page == head)
2425 return false; /* add count continuation */
2426 map = kmap_atomic(page, KM_USER0) + offset;
2427init_map: *map = 0; /* we didn't zero the page */
2428 }
2429 *map += 1;
2430 kunmap_atomic(map, KM_USER0);
2431 page = list_entry(page->lru.prev, struct page, lru);
2432 while (page != head) {
2433 map = kmap_atomic(page, KM_USER0) + offset;
2434 *map = COUNT_CONTINUED;
2435 kunmap_atomic(map, KM_USER0);
2436 page = list_entry(page->lru.prev, struct page, lru);
2437 }
2438 return true; /* incremented */
2439
2440 } else { /* decrementing */
2441 /*
2442 * Think of how you subtract 1 from 1000
2443 */
2444 BUG_ON(count != COUNT_CONTINUED);
2445 while (*map == COUNT_CONTINUED) {
2446 kunmap_atomic(map, KM_USER0);
2447 page = list_entry(page->lru.next, struct page, lru);
2448 BUG_ON(page == head);
2449 map = kmap_atomic(page, KM_USER0) + offset;
2450 }
2451 BUG_ON(*map == 0);
2452 *map -= 1;
2453 if (*map == 0)
2454 count = 0;
2455 kunmap_atomic(map, KM_USER0);
2456 page = list_entry(page->lru.prev, struct page, lru);
2457 while (page != head) {
2458 map = kmap_atomic(page, KM_USER0) + offset;
2459 *map = SWAP_CONT_MAX | count;
2460 count = COUNT_CONTINUED;
2461 kunmap_atomic(map, KM_USER0);
2462 page = list_entry(page->lru.prev, struct page, lru);
2463 }
2464 return count == COUNT_CONTINUED;
2465 }
2466}
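
Stripped of the kmap_atomic and lru-list walking, swap_count_continued() is doing ordinary positional arithmetic: the swap_map byte is the low digit with radix SWAP_MAP_MAX + 1, and each continuation page holds one higher digit with radix SWAP_CONT_MAX + 1. A userspace model of the carry and borrow, under those assumptions (constant values restated from <linux/swap.h>; overflow of the last digit is ignored in this toy):

#include <stdio.h>

#define SWAP_MAP_MAX    0x3e    /* max of the low digit (the swap_map byte) */
#define SWAP_CONT_MAX   0x7f    /* max of each continuation digit */
#define NDIGITS         4       /* plenty for this demo; no overflow check */

static unsigned char digit[NDIGITS];    /* digit[0] models the swap_map count */

static void inc_count(void)
{
        int i = 0;

        while (digit[i] == (i ? SWAP_CONT_MAX : SWAP_MAP_MAX)) {
                digit[i] = 0;           /* carry: "how you add 1 to 999" */
                i++;
        }
        digit[i]++;
}

static void dec_count(void)
{
        int i = 0;

        while (digit[i] == 0) {         /* borrow: "subtract 1 from 1000" */
                digit[i] = i ? SWAP_CONT_MAX : SWAP_MAP_MAX;
                i++;
        }
        digit[i]--;
}

int main(void)
{
        int i;

        for (i = 0; i < 200; i++)
                inc_count();
        printf("200 = %u + %u * %d\n", digit[0], digit[1], SWAP_MAP_MAX + 1);

        for (i = 0; i < 200; i++)
                dec_count();
        printf("after matching frees: low digit = %u\n", digit[0]);
        return 0;
}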
2467
2468/*
2469 * free_swap_count_continuations - swapoff free all the continuation pages
2470 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
2471 */
2472static void free_swap_count_continuations(struct swap_info_struct *si)
2473{
2474 pgoff_t offset;
2475
2476 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2477 struct page *head;
2478 head = vmalloc_to_page(si->swap_map + offset);
2479 if (page_private(head)) {
2480 struct list_head *this, *next;
2481 list_for_each_safe(this, next, &head->lru) {
2482 struct page *page;
2483 page = list_entry(this, struct page, lru);
2484 list_del(this);
2485 __free_page(page);
2486 }
2487 }
2488 }
2489}