Diffstat (limited to 'mm/swapfile.c')
-rw-r--r-- | mm/swapfile.c | 918 |
1 file changed, 594 insertions, 324 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9c590eef7912..6cd0a8f90dc7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
25 | #include <linux/ksm.h> | ||
25 | #include <linux/rmap.h> | 26 | #include <linux/rmap.h> |
26 | #include <linux/security.h> | 27 | #include <linux/security.h> |
27 | #include <linux/backing-dev.h> | 28 | #include <linux/backing-dev.h> |
@@ -35,11 +36,15 @@ | |||
35 | #include <linux/swapops.h> | 36 | #include <linux/swapops.h> |
36 | #include <linux/page_cgroup.h> | 37 | #include <linux/page_cgroup.h> |
37 | 38 | ||
39 | static bool swap_count_continued(struct swap_info_struct *, pgoff_t, | ||
40 | unsigned char); | ||
41 | static void free_swap_count_continuations(struct swap_info_struct *); | ||
42 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); | ||
43 | |||
38 | static DEFINE_SPINLOCK(swap_lock); | 44 | static DEFINE_SPINLOCK(swap_lock); |
39 | static unsigned int nr_swapfiles; | 45 | static unsigned int nr_swapfiles; |
40 | long nr_swap_pages; | 46 | long nr_swap_pages; |
41 | long total_swap_pages; | 47 | long total_swap_pages; |
42 | static int swap_overflow; | ||
43 | static int least_priority; | 48 | static int least_priority; |
44 | 49 | ||
45 | static const char Bad_file[] = "Bad swap file entry "; | 50 | static const char Bad_file[] = "Bad swap file entry "; |
@@ -49,42 +54,20 @@ static const char Unused_offset[] = "Unused swap offset entry "; | |||
49 | 54 | ||
50 | static struct swap_list_t swap_list = {-1, -1}; | 55 | static struct swap_list_t swap_list = {-1, -1}; |
51 | 56 | ||
52 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; | 57 | static struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
53 | 58 | ||
54 | static DEFINE_MUTEX(swapon_mutex); | 59 | static DEFINE_MUTEX(swapon_mutex); |
55 | 60 | ||
56 | /* For reference count accounting in swap_map */ | 61 | static inline unsigned char swap_count(unsigned char ent) |
57 | /* enum for swap_map[] handling. internal use only */ | ||
58 | enum { | ||
59 | SWAP_MAP = 0, /* ops for reference from swap users */ | ||
60 | SWAP_CACHE, /* ops for reference from swap cache */ | ||
61 | }; | ||
62 | |||
63 | static inline int swap_count(unsigned short ent) | ||
64 | { | ||
65 | return ent & SWAP_COUNT_MASK; | ||
66 | } | ||
67 | |||
68 | static inline bool swap_has_cache(unsigned short ent) | ||
69 | { | ||
70 | return !!(ent & SWAP_HAS_CACHE); | ||
71 | } | ||
72 | |||
73 | static inline unsigned short encode_swapmap(int count, bool has_cache) | ||
74 | { | 62 | { |
75 | unsigned short ret = count; | 63 | return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ |
76 | |||
77 | if (has_cache) | ||
78 | return SWAP_HAS_CACHE | ret; | ||
79 | return ret; | ||
80 | } | 64 | } |
81 | 65 | ||
82 | /* returnes 1 if swap entry is freed */ | 66 | /* returns 1 if swap entry is freed */ |
83 | static int | 67 | static int |
84 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | 68 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) |
85 | { | 69 | { |
86 | int type = si - swap_info; | 70 | swp_entry_t entry = swp_entry(si->type, offset); |
87 | swp_entry_t entry = swp_entry(type, offset); | ||
88 | struct page *page; | 71 | struct page *page; |
89 | int ret = 0; | 72 | int ret = 0; |
90 | 73 | ||
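The core of this change: swap_map[] entries shrink from unsigned short to unsigned char, with the reference count and the swap-cache flag packed into one byte. Below is a minimal userspace sketch of the new encoding, assuming the constant values this series defines in include/linux/swap.h (SWAP_HAS_CACHE 0x40, COUNT_CONTINUED 0x80, SWAP_MAP_MAX 0x3e); the helper mirrors the new swap_count() above:

/* swap_map byte model: low bits hold the count, 0x40 flags the swap
 * cache, 0x80 says the full count continues in a continuation page. */
#include <stdio.h>

#define SWAP_HAS_CACHE  0x40
#define COUNT_CONTINUED 0x80
#define SWAP_MAP_MAX    0x3e

/* mirrors swap_count(): strip the cache flag, keep COUNT_CONTINUED */
static unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;
}

int main(void)
{
	unsigned char ent = 3 | SWAP_HAS_CACHE;	/* three refs, page cached */

	printf("count=%u cached=%d\n",
	       swap_count(ent), !!(ent & SWAP_HAS_CACHE));
	/* prints: count=3 cached=1 */
	return 0;
}

This is also why scan_swap_map() now takes the initial byte directly: get_swap_page() passes SWAP_HAS_CACHE, while get_swap_page_of_type() passes 1, as the hunks further down show.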
@@ -120,7 +103,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
120 | down_read(&swap_unplug_sem); | 103 | down_read(&swap_unplug_sem); |
121 | entry.val = page_private(page); | 104 | entry.val = page_private(page); |
122 | if (PageSwapCache(page)) { | 105 | if (PageSwapCache(page)) { |
123 | struct block_device *bdev = swap_info[swp_type(entry)].bdev; | 106 | struct block_device *bdev = swap_info[swp_type(entry)]->bdev; |
124 | struct backing_dev_info *bdi; | 107 | struct backing_dev_info *bdi; |
125 | 108 | ||
126 | /* | 109 | /* |
@@ -146,23 +129,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
146 | static int discard_swap(struct swap_info_struct *si) | 129 | static int discard_swap(struct swap_info_struct *si) |
147 | { | 130 | { |
148 | struct swap_extent *se; | 131 | struct swap_extent *se; |
132 | sector_t start_block; | ||
133 | sector_t nr_blocks; | ||
149 | int err = 0; | 134 | int err = 0; |
150 | 135 | ||
151 | list_for_each_entry(se, &si->extent_list, list) { | 136 | /* Do not discard the swap header page! */ |
152 | sector_t start_block = se->start_block << (PAGE_SHIFT - 9); | 137 | se = &si->first_swap_extent; |
153 | sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); | 138 | start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); |
139 | nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); | ||
140 | if (nr_blocks) { | ||
141 | err = blkdev_issue_discard(si->bdev, start_block, | ||
142 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); | ||
143 | if (err) | ||
144 | return err; | ||
145 | cond_resched(); | ||
146 | } | ||
154 | 147 | ||
155 | if (se->start_page == 0) { | 148 | list_for_each_entry(se, &si->first_swap_extent.list, list) { |
156 | /* Do not discard the swap header page! */ | 149 | start_block = se->start_block << (PAGE_SHIFT - 9); |
157 | start_block += 1 << (PAGE_SHIFT - 9); | 150 | nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); |
158 | nr_blocks -= 1 << (PAGE_SHIFT - 9); | ||
159 | if (!nr_blocks) | ||
160 | continue; | ||
161 | } | ||
162 | 151 | ||
163 | err = blkdev_issue_discard(si->bdev, start_block, | 152 | err = blkdev_issue_discard(si->bdev, start_block, |
164 | nr_blocks, GFP_KERNEL, | 153 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); |
165 | DISCARD_FL_BARRIER); | ||
166 | if (err) | 154 | if (err) |
167 | break; | 155 | break; |
168 | 156 | ||
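The rewritten discard_swap() skips the header page by adjusting the page range of the first extent before converting to 512-byte sectors. A small sketch of that arithmetic, assuming 4KiB pages (PAGE_SHIFT 12, so 8 sectors per page); the extent values are made up:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint64_t start_page = 0, nr_pages = 1024;	/* hypothetical first extent */

	/* skip page 0 (the swap header), as the new code does */
	uint64_t start_block = (start_page + 1) << (PAGE_SHIFT - 9);
	uint64_t nr_blocks = (nr_pages - 1) << (PAGE_SHIFT - 9);

	printf("discard sectors [%llu, +%llu)\n",
	       (unsigned long long)start_block,
	       (unsigned long long)nr_blocks);	/* [8, +8184) */
	return 0;
}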
@@ -201,14 +189,11 @@ static void discard_swap_cluster(struct swap_info_struct *si, | |||
201 | start_block <<= PAGE_SHIFT - 9; | 189 | start_block <<= PAGE_SHIFT - 9; |
202 | nr_blocks <<= PAGE_SHIFT - 9; | 190 | nr_blocks <<= PAGE_SHIFT - 9; |
203 | if (blkdev_issue_discard(si->bdev, start_block, | 191 | if (blkdev_issue_discard(si->bdev, start_block, |
204 | nr_blocks, GFP_NOIO, | 192 | nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) |
205 | DISCARD_FL_BARRIER)) | ||
206 | break; | 193 | break; |
207 | } | 194 | } |
208 | 195 | ||
209 | lh = se->list.next; | 196 | lh = se->list.next; |
210 | if (lh == &si->extent_list) | ||
211 | lh = lh->next; | ||
212 | se = list_entry(lh, struct swap_extent, list); | 197 | se = list_entry(lh, struct swap_extent, list); |
213 | } | 198 | } |
214 | } | 199 | } |
@@ -223,7 +208,7 @@ static int wait_for_discard(void *word) | |||
223 | #define LATENCY_LIMIT 256 | 208 | #define LATENCY_LIMIT 256 |
224 | 209 | ||
225 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, | 210 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, |
226 | int cache) | 211 | unsigned char usage) |
227 | { | 212 | { |
228 | unsigned long offset; | 213 | unsigned long offset; |
229 | unsigned long scan_base; | 214 | unsigned long scan_base; |
@@ -354,10 +339,7 @@ checks: | |||
354 | si->lowest_bit = si->max; | 339 | si->lowest_bit = si->max; |
355 | si->highest_bit = 0; | 340 | si->highest_bit = 0; |
356 | } | 341 | } |
357 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ | 342 | si->swap_map[offset] = usage; |
358 | si->swap_map[offset] = encode_swapmap(0, true); | ||
359 | else /* at suspend */ | ||
360 | si->swap_map[offset] = encode_swapmap(1, false); | ||
361 | si->cluster_next = offset + 1; | 343 | si->cluster_next = offset + 1; |
362 | si->flags -= SWP_SCANNING; | 344 | si->flags -= SWP_SCANNING; |
363 | 345 | ||
@@ -467,10 +449,10 @@ swp_entry_t get_swap_page(void) | |||
467 | nr_swap_pages--; | 449 | nr_swap_pages--; |
468 | 450 | ||
469 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 451 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { |
470 | si = swap_info + type; | 452 | si = swap_info[type]; |
471 | next = si->next; | 453 | next = si->next; |
472 | if (next < 0 || | 454 | if (next < 0 || |
473 | (!wrapped && si->prio != swap_info[next].prio)) { | 455 | (!wrapped && si->prio != swap_info[next]->prio)) { |
474 | next = swap_list.head; | 456 | next = swap_list.head; |
475 | wrapped++; | 457 | wrapped++; |
476 | } | 458 | } |
@@ -482,7 +464,7 @@ swp_entry_t get_swap_page(void) | |||
482 | 464 | ||
483 | swap_list.next = next; | 465 | swap_list.next = next; |
484 | /* This is called for allocating swap entry for cache */ | 466 | /* This is called for allocating swap entry for cache */ |
485 | offset = scan_swap_map(si, SWAP_CACHE); | 467 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
486 | if (offset) { | 468 | if (offset) { |
487 | spin_unlock(&swap_lock); | 469 | spin_unlock(&swap_lock); |
488 | return swp_entry(type, offset); | 470 | return swp_entry(type, offset); |
@@ -503,11 +485,11 @@ swp_entry_t get_swap_page_of_type(int type) | |||
503 | pgoff_t offset; | 485 | pgoff_t offset; |
504 | 486 | ||
505 | spin_lock(&swap_lock); | 487 | spin_lock(&swap_lock); |
506 | si = swap_info + type; | 488 | si = swap_info[type]; |
507 | if (si->flags & SWP_WRITEOK) { | 489 | if (si && (si->flags & SWP_WRITEOK)) { |
508 | nr_swap_pages--; | 490 | nr_swap_pages--; |
509 | /* This is called for allocating swap entry, not cache */ | 491 | /* This is called for allocating swap entry, not cache */ |
510 | offset = scan_swap_map(si, SWAP_MAP); | 492 | offset = scan_swap_map(si, 1); |
511 | if (offset) { | 493 | if (offset) { |
512 | spin_unlock(&swap_lock); | 494 | spin_unlock(&swap_lock); |
513 | return swp_entry(type, offset); | 495 | return swp_entry(type, offset); |
@@ -518,9 +500,9 @@ swp_entry_t get_swap_page_of_type(int type) | |||
518 | return (swp_entry_t) {0}; | 500 | return (swp_entry_t) {0}; |
519 | } | 501 | } |
520 | 502 | ||
521 | static struct swap_info_struct * swap_info_get(swp_entry_t entry) | 503 | static struct swap_info_struct *swap_info_get(swp_entry_t entry) |
522 | { | 504 | { |
523 | struct swap_info_struct * p; | 505 | struct swap_info_struct *p; |
524 | unsigned long offset, type; | 506 | unsigned long offset, type; |
525 | 507 | ||
526 | if (!entry.val) | 508 | if (!entry.val) |
@@ -528,7 +510,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry) | |||
528 | type = swp_type(entry); | 510 | type = swp_type(entry); |
529 | if (type >= nr_swapfiles) | 511 | if (type >= nr_swapfiles) |
530 | goto bad_nofile; | 512 | goto bad_nofile; |
531 | p = & swap_info[type]; | 513 | p = swap_info[type]; |
532 | if (!(p->flags & SWP_USED)) | 514 | if (!(p->flags & SWP_USED)) |
533 | goto bad_device; | 515 | goto bad_device; |
534 | offset = swp_offset(entry); | 516 | offset = swp_offset(entry); |
@@ -554,41 +536,56 @@ out: | |||
554 | return NULL; | 536 | return NULL; |
555 | } | 537 | } |
556 | 538 | ||
557 | static int swap_entry_free(struct swap_info_struct *p, | 539 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
558 | swp_entry_t ent, int cache) | 540 | swp_entry_t entry, unsigned char usage) |
559 | { | 541 | { |
560 | unsigned long offset = swp_offset(ent); | 542 | unsigned long offset = swp_offset(entry); |
561 | int count = swap_count(p->swap_map[offset]); | 543 | unsigned char count; |
562 | bool has_cache; | 544 | unsigned char has_cache; |
563 | 545 | ||
564 | has_cache = swap_has_cache(p->swap_map[offset]); | 546 | count = p->swap_map[offset]; |
547 | has_cache = count & SWAP_HAS_CACHE; | ||
548 | count &= ~SWAP_HAS_CACHE; | ||
565 | 549 | ||
566 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ | 550 | if (usage == SWAP_HAS_CACHE) { |
567 | if (count < SWAP_MAP_MAX) { | ||
568 | count--; | ||
569 | p->swap_map[offset] = encode_swapmap(count, has_cache); | ||
570 | } | ||
571 | } else { /* dropping swap cache flag */ | ||
572 | VM_BUG_ON(!has_cache); | 551 | VM_BUG_ON(!has_cache); |
573 | p->swap_map[offset] = encode_swapmap(count, false); | 552 | has_cache = 0; |
574 | 553 | } else if (count == SWAP_MAP_SHMEM) { | |
554 | /* | ||
555 | * Or we could insist on shmem.c using a special | ||
556 | * swap_shmem_free() and free_shmem_swap_and_cache()... | ||
557 | */ | ||
558 | count = 0; | ||
559 | } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { | ||
560 | if (count == COUNT_CONTINUED) { | ||
561 | if (swap_count_continued(p, offset, count)) | ||
562 | count = SWAP_MAP_MAX | COUNT_CONTINUED; | ||
563 | else | ||
564 | count = SWAP_MAP_MAX; | ||
565 | } else | ||
566 | count--; | ||
575 | } | 567 | } |
576 | /* return code. */ | 568 | |
577 | count = p->swap_map[offset]; | 569 | if (!count) |
570 | mem_cgroup_uncharge_swap(entry); | ||
571 | |||
572 | usage = count | has_cache; | ||
573 | p->swap_map[offset] = usage; | ||
574 | |||
578 | /* free if no reference */ | 575 | /* free if no reference */ |
579 | if (!count) { | 576 | if (!usage) { |
580 | if (offset < p->lowest_bit) | 577 | if (offset < p->lowest_bit) |
581 | p->lowest_bit = offset; | 578 | p->lowest_bit = offset; |
582 | if (offset > p->highest_bit) | 579 | if (offset > p->highest_bit) |
583 | p->highest_bit = offset; | 580 | p->highest_bit = offset; |
584 | if (p->prio > swap_info[swap_list.next].prio) | 581 | if (swap_list.next >= 0 && |
585 | swap_list.next = p - swap_info; | 582 | p->prio > swap_info[swap_list.next]->prio) |
583 | swap_list.next = p->type; | ||
586 | nr_swap_pages++; | 584 | nr_swap_pages++; |
587 | p->inuse_pages--; | 585 | p->inuse_pages--; |
588 | } | 586 | } |
589 | if (!swap_count(count)) | 587 | |
590 | mem_cgroup_uncharge_swap(ent); | 588 | return usage; |
591 | return count; | ||
592 | } | 589 | } |
593 | 590 | ||
594 | /* | 591 | /* |
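The reworked swap_entry_free() is the other half of the encoding: drop one reference, either a map reference or the cache flag, handling the shmem and continued-count special cases. A userspace model follows, with swap_count_continued() stubbed to "no continuation" (the real one is declared at the top of this file and defined beyond this excerpt); constants as above, plus SWAP_MAP_SHMEM assumed to be 0xbf:

#include <stdio.h>

#define SWAP_HAS_CACHE  0x40
#define COUNT_CONTINUED 0x80
#define SWAP_MAP_MAX    0x3e
#define SWAP_MAP_SHMEM  0xbf

static int swap_count_continued(void) { return 0; }	/* stub */

/* returns the new swap_map byte after dropping 'usage' */
static unsigned char entry_free(unsigned char map, unsigned char usage)
{
	unsigned char count = map & ~SWAP_HAS_CACHE;
	unsigned char has_cache = map & SWAP_HAS_CACHE;

	if (usage == SWAP_HAS_CACHE)
		has_cache = 0;			/* drop the cache reference */
	else if (count == SWAP_MAP_SHMEM)
		count = 0;			/* shmem: single owner reference */
	else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
		if (count == COUNT_CONTINUED)
			count = swap_count_continued() ?
				SWAP_MAP_MAX | COUNT_CONTINUED : SWAP_MAP_MAX;
		else
			count--;
	}
	return count | has_cache;
}

int main(void)
{
	printf("%#x\n", entry_free(3 | SWAP_HAS_CACHE, 1));	/* 0x42 */
	printf("%#x\n", entry_free(SWAP_MAP_SHMEM, 1));		/* 0 */
	return 0;
}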
@@ -597,11 +594,11 @@ static int swap_entry_free(struct swap_info_struct *p, | |||
597 | */ | 594 | */ |
598 | void swap_free(swp_entry_t entry) | 595 | void swap_free(swp_entry_t entry) |
599 | { | 596 | { |
600 | struct swap_info_struct * p; | 597 | struct swap_info_struct *p; |
601 | 598 | ||
602 | p = swap_info_get(entry); | 599 | p = swap_info_get(entry); |
603 | if (p) { | 600 | if (p) { |
604 | swap_entry_free(p, entry, SWAP_MAP); | 601 | swap_entry_free(p, entry, 1); |
605 | spin_unlock(&swap_lock); | 602 | spin_unlock(&swap_lock); |
606 | } | 603 | } |
607 | } | 604 | } |
@@ -612,26 +609,21 @@ void swap_free(swp_entry_t entry) | |||
612 | void swapcache_free(swp_entry_t entry, struct page *page) | 609 | void swapcache_free(swp_entry_t entry, struct page *page) |
613 | { | 610 | { |
614 | struct swap_info_struct *p; | 611 | struct swap_info_struct *p; |
615 | int ret; | 612 | unsigned char count; |
616 | 613 | ||
617 | p = swap_info_get(entry); | 614 | p = swap_info_get(entry); |
618 | if (p) { | 615 | if (p) { |
619 | ret = swap_entry_free(p, entry, SWAP_CACHE); | 616 | count = swap_entry_free(p, entry, SWAP_HAS_CACHE); |
620 | if (page) { | 617 | if (page) |
621 | bool swapout; | 618 | mem_cgroup_uncharge_swapcache(page, entry, count != 0); |
622 | if (ret) | ||
623 | swapout = true; /* the end of swap out */ | ||
624 | else | ||
625 | swapout = false; /* no more swap users! */ | ||
626 | mem_cgroup_uncharge_swapcache(page, entry, swapout); | ||
627 | } | ||
628 | spin_unlock(&swap_lock); | 619 | spin_unlock(&swap_lock); |
629 | } | 620 | } |
630 | return; | ||
631 | } | 621 | } |
632 | 622 | ||
633 | /* | 623 | /* |
634 | * How many references to page are currently swapped out? | 624 | * How many references to page are currently swapped out? |
625 | * This does not give an exact answer when swap count is continued, | ||
626 | * but does include the high COUNT_CONTINUED flag to allow for that. | ||
635 | */ | 627 | */ |
636 | static inline int page_swapcount(struct page *page) | 628 | static inline int page_swapcount(struct page *page) |
637 | { | 629 | { |
@@ -659,6 +651,8 @@ int reuse_swap_page(struct page *page) | |||
659 | int count; | 651 | int count; |
660 | 652 | ||
661 | VM_BUG_ON(!PageLocked(page)); | 653 | VM_BUG_ON(!PageLocked(page)); |
654 | if (unlikely(PageKsm(page))) | ||
655 | return 0; | ||
662 | count = page_mapcount(page); | 656 | count = page_mapcount(page); |
663 | if (count <= 1 && PageSwapCache(page)) { | 657 | if (count <= 1 && PageSwapCache(page)) { |
664 | count += page_swapcount(page); | 658 | count += page_swapcount(page); |
@@ -667,7 +661,7 @@ int reuse_swap_page(struct page *page) | |||
667 | SetPageDirty(page); | 661 | SetPageDirty(page); |
668 | } | 662 | } |
669 | } | 663 | } |
670 | return count == 1; | 664 | return count <= 1; |
671 | } | 665 | } |
672 | 666 | ||
673 | /* | 667 | /* |
@@ -704,7 +698,7 @@ int free_swap_and_cache(swp_entry_t entry) | |||
704 | 698 | ||
705 | p = swap_info_get(entry); | 699 | p = swap_info_get(entry); |
706 | if (p) { | 700 | if (p) { |
707 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { | 701 | if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { |
708 | page = find_get_page(&swapper_space, entry.val); | 702 | page = find_get_page(&swapper_space, entry.val); |
709 | if (page && !trylock_page(page)) { | 703 | if (page && !trylock_page(page)) { |
710 | page_cache_release(page); | 704 | page_cache_release(page); |
@@ -729,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry) | |||
729 | return p != NULL; | 723 | return p != NULL; |
730 | } | 724 | } |
731 | 725 | ||
726 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
727 | /** | ||
728 | * mem_cgroup_count_swap_user - count the user of a swap entry | ||
729 | * @ent: the swap entry to be checked | ||
730 | * @pagep: the pointer for the swap cache page of the entry to be stored | ||
731 | * | ||
732 | * Returns the number of the user of the swap entry. The number is valid only | ||
733 | * for swaps of anonymous pages. | ||
734 | * If the entry is found on swap cache, the page is stored to pagep with | ||
735 | * refcount of it being incremented. | ||
736 | */ | ||
737 | int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) | ||
738 | { | ||
739 | struct page *page; | ||
740 | struct swap_info_struct *p; | ||
741 | int count = 0; | ||
742 | |||
743 | page = find_get_page(&swapper_space, ent.val); | ||
744 | if (page) | ||
745 | count += page_mapcount(page); | ||
746 | p = swap_info_get(ent); | ||
747 | if (p) { | ||
748 | count += swap_count(p->swap_map[swp_offset(ent)]); | ||
749 | spin_unlock(&swap_lock); | ||
750 | } | ||
751 | |||
752 | *pagep = page; | ||
753 | return count; | ||
754 | } | ||
755 | #endif | ||
756 | |||
732 | #ifdef CONFIG_HIBERNATION | 757 | #ifdef CONFIG_HIBERNATION |
733 | /* | 758 | /* |
734 | * Find the swap type that corresponds to given device (if any). | 759 | * Find the swap type that corresponds to given device (if any). |
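For the new mem_cgroup_count_swap_user(), "users" means the mapcount of the swap-cache page (if present) plus the swap_map count. A toy calculation with hypothetical values:

#include <stdio.h>

#define SWAP_HAS_CACHE 0x40

static unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;
}

int main(void)
{
	int page_mapcount = 2;			/* ptes mapping the cache page */
	unsigned char map = 1 | SWAP_HAS_CACHE;	/* one swap ref, page cached */

	printf("users=%d\n", page_mapcount + swap_count(map));	/* users=3 */
	return 0;
}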
@@ -741,14 +766,14 @@ int free_swap_and_cache(swp_entry_t entry) | |||
741 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | 766 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) |
742 | { | 767 | { |
743 | struct block_device *bdev = NULL; | 768 | struct block_device *bdev = NULL; |
744 | int i; | 769 | int type; |
745 | 770 | ||
746 | if (device) | 771 | if (device) |
747 | bdev = bdget(device); | 772 | bdev = bdget(device); |
748 | 773 | ||
749 | spin_lock(&swap_lock); | 774 | spin_lock(&swap_lock); |
750 | for (i = 0; i < nr_swapfiles; i++) { | 775 | for (type = 0; type < nr_swapfiles; type++) { |
751 | struct swap_info_struct *sis = swap_info + i; | 776 | struct swap_info_struct *sis = swap_info[type]; |
752 | 777 | ||
753 | if (!(sis->flags & SWP_WRITEOK)) | 778 | if (!(sis->flags & SWP_WRITEOK)) |
754 | continue; | 779 | continue; |
@@ -758,20 +783,18 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
758 | *bdev_p = bdgrab(sis->bdev); | 783 | *bdev_p = bdgrab(sis->bdev); |
759 | 784 | ||
760 | spin_unlock(&swap_lock); | 785 | spin_unlock(&swap_lock); |
761 | return i; | 786 | return type; |
762 | } | 787 | } |
763 | if (bdev == sis->bdev) { | 788 | if (bdev == sis->bdev) { |
764 | struct swap_extent *se; | 789 | struct swap_extent *se = &sis->first_swap_extent; |
765 | 790 | ||
766 | se = list_entry(sis->extent_list.next, | ||
767 | struct swap_extent, list); | ||
768 | if (se->start_block == offset) { | 791 | if (se->start_block == offset) { |
769 | if (bdev_p) | 792 | if (bdev_p) |
770 | *bdev_p = bdgrab(sis->bdev); | 793 | *bdev_p = bdgrab(sis->bdev); |
771 | 794 | ||
772 | spin_unlock(&swap_lock); | 795 | spin_unlock(&swap_lock); |
773 | bdput(bdev); | 796 | bdput(bdev); |
774 | return i; | 797 | return type; |
775 | } | 798 | } |
776 | } | 799 | } |
777 | } | 800 | } |
@@ -783,6 +806,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
783 | } | 806 | } |
784 | 807 | ||
785 | /* | 808 | /* |
809 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | ||
810 | * corresponding to given index in swap_info (swap type). | ||
811 | */ | ||
812 | sector_t swapdev_block(int type, pgoff_t offset) | ||
813 | { | ||
814 | struct block_device *bdev; | ||
815 | |||
816 | if ((unsigned int)type >= nr_swapfiles) | ||
817 | return 0; | ||
818 | if (!(swap_info[type]->flags & SWP_WRITEOK)) | ||
819 | return 0; | ||
820 | return map_swap_entry(swp_entry(type, offset), &bdev); | ||
821 | } | ||
822 | |||
823 | /* | ||
786 | * Return either the total number of swap pages of given type, or the number | 824 | * Return either the total number of swap pages of given type, or the number |
787 | * of free pages of that type (depending on @free) | 825 | * of free pages of that type (depending on @free) |
788 | * | 826 | * |
@@ -792,18 +830,20 @@ unsigned int count_swap_pages(int type, int free) | |||
792 | { | 830 | { |
793 | unsigned int n = 0; | 831 | unsigned int n = 0; |
794 | 832 | ||
795 | if (type < nr_swapfiles) { | 833 | spin_lock(&swap_lock); |
796 | spin_lock(&swap_lock); | 834 | if ((unsigned int)type < nr_swapfiles) { |
797 | if (swap_info[type].flags & SWP_WRITEOK) { | 835 | struct swap_info_struct *sis = swap_info[type]; |
798 | n = swap_info[type].pages; | 836 | |
837 | if (sis->flags & SWP_WRITEOK) { | ||
838 | n = sis->pages; | ||
799 | if (free) | 839 | if (free) |
800 | n -= swap_info[type].inuse_pages; | 840 | n -= sis->inuse_pages; |
801 | } | 841 | } |
802 | spin_unlock(&swap_lock); | ||
803 | } | 842 | } |
843 | spin_unlock(&swap_lock); | ||
804 | return n; | 844 | return n; |
805 | } | 845 | } |
806 | #endif | 846 | #endif /* CONFIG_HIBERNATION */ |
807 | 847 | ||
808 | /* | 848 | /* |
809 | * No need to decide whether this PTE shares the swap entry with others, | 849 | * No need to decide whether this PTE shares the swap entry with others, |
@@ -831,7 +871,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
831 | goto out; | 871 | goto out; |
832 | } | 872 | } |
833 | 873 | ||
834 | inc_mm_counter(vma->vm_mm, anon_rss); | 874 | dec_mm_counter(vma->vm_mm, MM_SWAPENTS); |
875 | inc_mm_counter(vma->vm_mm, MM_ANONPAGES); | ||
835 | get_page(page); | 876 | get_page(page); |
836 | set_pte_at(vma->vm_mm, addr, pte, | 877 | set_pte_at(vma->vm_mm, addr, pte, |
837 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 878 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
@@ -932,7 +973,7 @@ static int unuse_vma(struct vm_area_struct *vma, | |||
932 | unsigned long addr, end, next; | 973 | unsigned long addr, end, next; |
933 | int ret; | 974 | int ret; |
934 | 975 | ||
935 | if (page->mapping) { | 976 | if (page_anon_vma(page)) { |
936 | addr = page_address_in_vma(page, vma); | 977 | addr = page_address_in_vma(page, vma); |
937 | if (addr == -EFAULT) | 978 | if (addr == -EFAULT) |
938 | return 0; | 979 | return 0; |
@@ -988,7 +1029,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
988 | { | 1029 | { |
989 | unsigned int max = si->max; | 1030 | unsigned int max = si->max; |
990 | unsigned int i = prev; | 1031 | unsigned int i = prev; |
991 | int count; | 1032 | unsigned char count; |
992 | 1033 | ||
993 | /* | 1034 | /* |
994 | * No need for swap_lock here: we're just looking | 1035 | * No need for swap_lock here: we're just looking |
@@ -1024,16 +1065,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1024 | */ | 1065 | */ |
1025 | static int try_to_unuse(unsigned int type) | 1066 | static int try_to_unuse(unsigned int type) |
1026 | { | 1067 | { |
1027 | struct swap_info_struct * si = &swap_info[type]; | 1068 | struct swap_info_struct *si = swap_info[type]; |
1028 | struct mm_struct *start_mm; | 1069 | struct mm_struct *start_mm; |
1029 | unsigned short *swap_map; | 1070 | unsigned char *swap_map; |
1030 | unsigned short swcount; | 1071 | unsigned char swcount; |
1031 | struct page *page; | 1072 | struct page *page; |
1032 | swp_entry_t entry; | 1073 | swp_entry_t entry; |
1033 | unsigned int i = 0; | 1074 | unsigned int i = 0; |
1034 | int retval = 0; | 1075 | int retval = 0; |
1035 | int reset_overflow = 0; | ||
1036 | int shmem; | ||
1037 | 1076 | ||
1038 | /* | 1077 | /* |
1039 | * When searching mms for an entry, a good strategy is to | 1078 | * When searching mms for an entry, a good strategy is to |
@@ -1047,8 +1086,7 @@ static int try_to_unuse(unsigned int type) | |||
1047 | * together, child after parent. If we race with dup_mmap(), we | 1086 | * together, child after parent. If we race with dup_mmap(), we |
1048 | * prefer to resolve parent before child, lest we miss entries | 1087 | * prefer to resolve parent before child, lest we miss entries |
1049 | * duplicated after we scanned child: using last mm would invert | 1088 | * duplicated after we scanned child: using last mm would invert |
1050 | * that. Though it's only a serious concern when an overflowed | 1089 | * that. |
1051 | * swap count is reset from SWAP_MAP_MAX, preventing a rescan. | ||
1052 | */ | 1090 | */ |
1053 | start_mm = &init_mm; | 1091 | start_mm = &init_mm; |
1054 | atomic_inc(&init_mm.mm_users); | 1092 | atomic_inc(&init_mm.mm_users); |
@@ -1110,17 +1148,18 @@ static int try_to_unuse(unsigned int type) | |||
1110 | 1148 | ||
1111 | /* | 1149 | /* |
1112 | * Remove all references to entry. | 1150 | * Remove all references to entry. |
1113 | * Whenever we reach init_mm, there's no address space | ||
1114 | * to search, but use it as a reminder to search shmem. | ||
1115 | */ | 1151 | */ |
1116 | shmem = 0; | ||
1117 | swcount = *swap_map; | 1152 | swcount = *swap_map; |
1118 | if (swap_count(swcount)) { | 1153 | if (swap_count(swcount) == SWAP_MAP_SHMEM) { |
1119 | if (start_mm == &init_mm) | 1154 | retval = shmem_unuse(entry, page); |
1120 | shmem = shmem_unuse(entry, page); | 1155 | /* page has already been unlocked and released */ |
1121 | else | 1156 | if (retval < 0) |
1122 | retval = unuse_mm(start_mm, entry, page); | 1157 | break; |
1158 | continue; | ||
1123 | } | 1159 | } |
1160 | if (swap_count(swcount) && start_mm != &init_mm) | ||
1161 | retval = unuse_mm(start_mm, entry, page); | ||
1162 | |||
1124 | if (swap_count(*swap_map)) { | 1163 | if (swap_count(*swap_map)) { |
1125 | int set_start_mm = (*swap_map >= swcount); | 1164 | int set_start_mm = (*swap_map >= swcount); |
1126 | struct list_head *p = &start_mm->mmlist; | 1165 | struct list_head *p = &start_mm->mmlist; |
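With SWAP_MAP_SHMEM, try_to_unuse() can tell shmem-owned entries apart by the count itself, instead of abusing init_mm as a "remember to search shmem" marker. The dispatch in miniature, with shmem_unuse()/unuse_mm() reduced to prints and the SWAP_MAP_SHMEM value assumed as before:

#include <stdio.h>

#define SWAP_HAS_CACHE 0x40
#define SWAP_MAP_SHMEM 0xbf

static unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;
}

int main(void)
{
	unsigned char swcount = SWAP_MAP_SHMEM | SWAP_HAS_CACHE;

	if (swap_count(swcount) == SWAP_MAP_SHMEM)
		printf("shmem_unuse()\n");	/* shmem owns the entry */
	else if (swap_count(swcount))
		printf("unuse_mm()\n");		/* walk the mm lists */
	return 0;
}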
@@ -1131,7 +1170,7 @@ static int try_to_unuse(unsigned int type) | |||
1131 | atomic_inc(&new_start_mm->mm_users); | 1170 | atomic_inc(&new_start_mm->mm_users); |
1132 | atomic_inc(&prev_mm->mm_users); | 1171 | atomic_inc(&prev_mm->mm_users); |
1133 | spin_lock(&mmlist_lock); | 1172 | spin_lock(&mmlist_lock); |
1134 | while (swap_count(*swap_map) && !retval && !shmem && | 1173 | while (swap_count(*swap_map) && !retval && |
1135 | (p = p->next) != &start_mm->mmlist) { | 1174 | (p = p->next) != &start_mm->mmlist) { |
1136 | mm = list_entry(p, struct mm_struct, mmlist); | 1175 | mm = list_entry(p, struct mm_struct, mmlist); |
1137 | if (!atomic_inc_not_zero(&mm->mm_users)) | 1176 | if (!atomic_inc_not_zero(&mm->mm_users)) |
@@ -1145,10 +1184,9 @@ static int try_to_unuse(unsigned int type) | |||
1145 | swcount = *swap_map; | 1184 | swcount = *swap_map; |
1146 | if (!swap_count(swcount)) /* any usage ? */ | 1185 | if (!swap_count(swcount)) /* any usage ? */ |
1147 | ; | 1186 | ; |
1148 | else if (mm == &init_mm) { | 1187 | else if (mm == &init_mm) |
1149 | set_start_mm = 1; | 1188 | set_start_mm = 1; |
1150 | shmem = shmem_unuse(entry, page); | 1189 | else |
1151 | } else | ||
1152 | retval = unuse_mm(mm, entry, page); | 1190 | retval = unuse_mm(mm, entry, page); |
1153 | 1191 | ||
1154 | if (set_start_mm && *swap_map < swcount) { | 1192 | if (set_start_mm && *swap_map < swcount) { |
@@ -1164,13 +1202,6 @@ static int try_to_unuse(unsigned int type) | |||
1164 | mmput(start_mm); | 1202 | mmput(start_mm); |
1165 | start_mm = new_start_mm; | 1203 | start_mm = new_start_mm; |
1166 | } | 1204 | } |
1167 | if (shmem) { | ||
1168 | /* page has already been unlocked and released */ | ||
1169 | if (shmem > 0) | ||
1170 | continue; | ||
1171 | retval = shmem; | ||
1172 | break; | ||
1173 | } | ||
1174 | if (retval) { | 1205 | if (retval) { |
1175 | unlock_page(page); | 1206 | unlock_page(page); |
1176 | page_cache_release(page); | 1207 | page_cache_release(page); |
@@ -1178,30 +1209,6 @@ static int try_to_unuse(unsigned int type) | |||
1178 | } | 1209 | } |
1179 | 1210 | ||
1180 | /* | 1211 | /* |
1181 | * How could swap count reach 0x7ffe ? | ||
1182 | * There's no way to repeat a swap page within an mm | ||
1183 | * (except in shmem, where it's the shared object which takes | ||
1184 | * the reference count)? | ||
1185 | * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned | ||
1186 | * short is too small....) | ||
1187 | * If that's wrong, then we should worry more about | ||
1188 | * exit_mmap() and do_munmap() cases described above: | ||
1189 | * we might be resetting SWAP_MAP_MAX too early here. | ||
1190 | * We know "Undead"s can happen, they're okay, so don't | ||
1191 | * report them; but do report if we reset SWAP_MAP_MAX. | ||
1192 | */ | ||
1193 | /* We might release the lock_page() in unuse_mm(). */ | ||
1194 | if (!PageSwapCache(page) || page_private(page) != entry.val) | ||
1195 | goto retry; | ||
1196 | |||
1197 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { | ||
1198 | spin_lock(&swap_lock); | ||
1199 | *swap_map = encode_swapmap(0, true); | ||
1200 | spin_unlock(&swap_lock); | ||
1201 | reset_overflow = 1; | ||
1202 | } | ||
1203 | |||
1204 | /* | ||
1205 | * If a reference remains (rare), we would like to leave | 1212 | * If a reference remains (rare), we would like to leave |
1206 | * the page in the swap cache; but try_to_unmap could | 1213 | * the page in the swap cache; but try_to_unmap could |
1207 | * then re-duplicate the entry once we drop page lock, | 1214 | * then re-duplicate the entry once we drop page lock, |
@@ -1213,6 +1220,12 @@ static int try_to_unuse(unsigned int type) | |||
1213 | * read from disk into another page. Splitting into two | 1220 | * read from disk into another page. Splitting into two |
1214 | * pages would be incorrect if swap supported "shared | 1221 | * pages would be incorrect if swap supported "shared |
1215 | * private" pages, but they are handled by tmpfs files. | 1222 | * private" pages, but they are handled by tmpfs files. |
1223 | * | ||
1224 | * Given how unuse_vma() targets one particular offset | ||
1225 | * in an anon_vma, once the anon_vma has been determined, | ||
1226 | * this splitting happens to be just what is needed to | ||
1227 | * handle where KSM pages have been swapped out: re-reading | ||
1228 | * is unnecessarily slow, but we can fix that later on. | ||
1216 | */ | 1229 | */ |
1217 | if (swap_count(*swap_map) && | 1230 | if (swap_count(*swap_map) && |
1218 | PageDirty(page) && PageSwapCache(page)) { | 1231 | PageDirty(page) && PageSwapCache(page)) { |
@@ -1242,7 +1255,6 @@ static int try_to_unuse(unsigned int type) | |||
1242 | * mark page dirty so shrink_page_list will preserve it. | 1255 | * mark page dirty so shrink_page_list will preserve it. |
1243 | */ | 1256 | */ |
1244 | SetPageDirty(page); | 1257 | SetPageDirty(page); |
1245 | retry: | ||
1246 | unlock_page(page); | 1258 | unlock_page(page); |
1247 | page_cache_release(page); | 1259 | page_cache_release(page); |
1248 | 1260 | ||
@@ -1254,10 +1266,6 @@ retry: | |||
1254 | } | 1266 | } |
1255 | 1267 | ||
1256 | mmput(start_mm); | 1268 | mmput(start_mm); |
1257 | if (reset_overflow) { | ||
1258 | printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); | ||
1259 | swap_overflow = 0; | ||
1260 | } | ||
1261 | return retval; | 1269 | return retval; |
1262 | } | 1270 | } |
1263 | 1271 | ||
@@ -1270,10 +1278,10 @@ retry: | |||
1270 | static void drain_mmlist(void) | 1278 | static void drain_mmlist(void) |
1271 | { | 1279 | { |
1272 | struct list_head *p, *next; | 1280 | struct list_head *p, *next; |
1273 | unsigned int i; | 1281 | unsigned int type; |
1274 | 1282 | ||
1275 | for (i = 0; i < nr_swapfiles; i++) | 1283 | for (type = 0; type < nr_swapfiles; type++) |
1276 | if (swap_info[i].inuse_pages) | 1284 | if (swap_info[type]->inuse_pages) |
1277 | return; | 1285 | return; |
1278 | spin_lock(&mmlist_lock); | 1286 | spin_lock(&mmlist_lock); |
1279 | list_for_each_safe(p, next, &init_mm.mmlist) | 1287 | list_for_each_safe(p, next, &init_mm.mmlist) |
@@ -1283,12 +1291,23 @@ static void drain_mmlist(void) | |||
1283 | 1291 | ||
1284 | /* | 1292 | /* |
1285 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which | 1293 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which |
1286 | * corresponds to page offset `offset'. | 1294 | * corresponds to page offset for the specified swap entry. |
1295 | * Note that the type of this function is sector_t, but it returns page offset | ||
1296 | * into the bdev, not sector offset. | ||
1287 | */ | 1297 | */ |
1288 | sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | 1298 | static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) |
1289 | { | 1299 | { |
1290 | struct swap_extent *se = sis->curr_swap_extent; | 1300 | struct swap_info_struct *sis; |
1291 | struct swap_extent *start_se = se; | 1301 | struct swap_extent *start_se; |
1302 | struct swap_extent *se; | ||
1303 | pgoff_t offset; | ||
1304 | |||
1305 | sis = swap_info[swp_type(entry)]; | ||
1306 | *bdev = sis->bdev; | ||
1307 | |||
1308 | offset = swp_offset(entry); | ||
1309 | start_se = sis->curr_swap_extent; | ||
1310 | se = start_se; | ||
1292 | 1311 | ||
1293 | for ( ; ; ) { | 1312 | for ( ; ; ) { |
1294 | struct list_head *lh; | 1313 | struct list_head *lh; |
@@ -1298,40 +1317,31 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | |||
1298 | return se->start_block + (offset - se->start_page); | 1317 | return se->start_block + (offset - se->start_page); |
1299 | } | 1318 | } |
1300 | lh = se->list.next; | 1319 | lh = se->list.next; |
1301 | if (lh == &sis->extent_list) | ||
1302 | lh = lh->next; | ||
1303 | se = list_entry(lh, struct swap_extent, list); | 1320 | se = list_entry(lh, struct swap_extent, list); |
1304 | sis->curr_swap_extent = se; | 1321 | sis->curr_swap_extent = se; |
1305 | BUG_ON(se == start_se); /* It *must* be present */ | 1322 | BUG_ON(se == start_se); /* It *must* be present */ |
1306 | } | 1323 | } |
1307 | } | 1324 | } |
1308 | 1325 | ||
1309 | #ifdef CONFIG_HIBERNATION | ||
1310 | /* | 1326 | /* |
1311 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | 1327 | * Returns the page offset into bdev for the specified page's swap entry. |
1312 | * corresponding to given index in swap_info (swap type). | ||
1313 | */ | 1328 | */ |
1314 | sector_t swapdev_block(int swap_type, pgoff_t offset) | 1329 | sector_t map_swap_page(struct page *page, struct block_device **bdev) |
1315 | { | 1330 | { |
1316 | struct swap_info_struct *sis; | 1331 | swp_entry_t entry; |
1317 | 1332 | entry.val = page_private(page); | |
1318 | if (swap_type >= nr_swapfiles) | 1333 | return map_swap_entry(entry, bdev); |
1319 | return 0; | ||
1320 | |||
1321 | sis = swap_info + swap_type; | ||
1322 | return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; | ||
1323 | } | 1334 | } |
1324 | #endif /* CONFIG_HIBERNATION */ | ||
1325 | 1335 | ||
1326 | /* | 1336 | /* |
1327 | * Free all of a swapdev's extent information | 1337 | * Free all of a swapdev's extent information |
1328 | */ | 1338 | */ |
1329 | static void destroy_swap_extents(struct swap_info_struct *sis) | 1339 | static void destroy_swap_extents(struct swap_info_struct *sis) |
1330 | { | 1340 | { |
1331 | while (!list_empty(&sis->extent_list)) { | 1341 | while (!list_empty(&sis->first_swap_extent.list)) { |
1332 | struct swap_extent *se; | 1342 | struct swap_extent *se; |
1333 | 1343 | ||
1334 | se = list_entry(sis->extent_list.next, | 1344 | se = list_entry(sis->first_swap_extent.list.next, |
1335 | struct swap_extent, list); | 1345 | struct swap_extent, list); |
1336 | list_del(&se->list); | 1346 | list_del(&se->list); |
1337 | kfree(se); | 1347 | kfree(se); |
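With the first extent embedded in swap_info_struct, the extent list is fully circular and the walk needs no list-head special case any more. A userspace model of the lookup in map_swap_entry(), using simplified hypothetical types (the kernel walks a struct list_head and caches curr_swap_extent):

#include <stdio.h>

struct extent {
	unsigned long start_page, nr_pages, start_block;
	struct extent *next;			/* circular list */
};

static unsigned long lookup(struct extent *curr, unsigned long offset)
{
	struct extent *start = curr;

	for (;;) {
		if (offset >= curr->start_page &&
		    offset < curr->start_page + curr->nr_pages)
			return curr->start_block + (offset - curr->start_page);
		curr = curr->next;
		if (curr == start)		/* kernel BUG()s: must be found */
			return ~0UL;
	}
}

int main(void)
{
	struct extent e1 = { 0, 100, 1000, 0 }, e2 = { 100, 50, 5000, 0 };

	e1.next = &e2;
	e2.next = &e1;
	printf("block=%lu\n", lookup(&e1, 120));	/* block=5020 */
	return 0;
}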
@@ -1352,8 +1362,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
1352 | struct swap_extent *new_se; | 1362 | struct swap_extent *new_se; |
1353 | struct list_head *lh; | 1363 | struct list_head *lh; |
1354 | 1364 | ||
1355 | lh = sis->extent_list.prev; /* The highest page extent */ | 1365 | if (start_page == 0) { |
1356 | if (lh != &sis->extent_list) { | 1366 | se = &sis->first_swap_extent; |
1367 | sis->curr_swap_extent = se; | ||
1368 | se->start_page = 0; | ||
1369 | se->nr_pages = nr_pages; | ||
1370 | se->start_block = start_block; | ||
1371 | return 1; | ||
1372 | } else { | ||
1373 | lh = sis->first_swap_extent.list.prev; /* Highest extent */ | ||
1357 | se = list_entry(lh, struct swap_extent, list); | 1374 | se = list_entry(lh, struct swap_extent, list); |
1358 | BUG_ON(se->start_page + se->nr_pages != start_page); | 1375 | BUG_ON(se->start_page + se->nr_pages != start_page); |
1359 | if (se->start_block + se->nr_pages == start_block) { | 1376 | if (se->start_block + se->nr_pages == start_block) { |
@@ -1373,7 +1390,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
1373 | new_se->nr_pages = nr_pages; | 1390 | new_se->nr_pages = nr_pages; |
1374 | new_se->start_block = start_block; | 1391 | new_se->start_block = start_block; |
1375 | 1392 | ||
1376 | list_add_tail(&new_se->list, &sis->extent_list); | 1393 | list_add_tail(&new_se->list, &sis->first_swap_extent.list); |
1377 | return 1; | 1394 | return 1; |
1378 | } | 1395 | } |
1379 | 1396 | ||
@@ -1425,7 +1442,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | |||
1425 | if (S_ISBLK(inode->i_mode)) { | 1442 | if (S_ISBLK(inode->i_mode)) { |
1426 | ret = add_swap_extent(sis, 0, sis->max, 0); | 1443 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1427 | *span = sis->pages; | 1444 | *span = sis->pages; |
1428 | goto done; | 1445 | goto out; |
1429 | } | 1446 | } |
1430 | 1447 | ||
1431 | blkbits = inode->i_blkbits; | 1448 | blkbits = inode->i_blkbits; |
@@ -1496,25 +1513,22 @@ reprobe: | |||
1496 | sis->max = page_no; | 1513 | sis->max = page_no; |
1497 | sis->pages = page_no - 1; | 1514 | sis->pages = page_no - 1; |
1498 | sis->highest_bit = page_no - 1; | 1515 | sis->highest_bit = page_no - 1; |
1499 | done: | 1516 | out: |
1500 | sis->curr_swap_extent = list_entry(sis->extent_list.prev, | 1517 | return ret; |
1501 | struct swap_extent, list); | ||
1502 | goto out; | ||
1503 | bad_bmap: | 1518 | bad_bmap: |
1504 | printk(KERN_ERR "swapon: swapfile has holes\n"); | 1519 | printk(KERN_ERR "swapon: swapfile has holes\n"); |
1505 | ret = -EINVAL; | 1520 | ret = -EINVAL; |
1506 | out: | 1521 | goto out; |
1507 | return ret; | ||
1508 | } | 1522 | } |
1509 | 1523 | ||
1510 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | 1524 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
1511 | { | 1525 | { |
1512 | struct swap_info_struct * p = NULL; | 1526 | struct swap_info_struct *p = NULL; |
1513 | unsigned short *swap_map; | 1527 | unsigned char *swap_map; |
1514 | struct file *swap_file, *victim; | 1528 | struct file *swap_file, *victim; |
1515 | struct address_space *mapping; | 1529 | struct address_space *mapping; |
1516 | struct inode *inode; | 1530 | struct inode *inode; |
1517 | char * pathname; | 1531 | char *pathname; |
1518 | int i, type, prev; | 1532 | int i, type, prev; |
1519 | int err; | 1533 | int err; |
1520 | 1534 | ||
@@ -1535,8 +1549,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1535 | mapping = victim->f_mapping; | 1549 | mapping = victim->f_mapping; |
1536 | prev = -1; | 1550 | prev = -1; |
1537 | spin_lock(&swap_lock); | 1551 | spin_lock(&swap_lock); |
1538 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { | 1552 | for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { |
1539 | p = swap_info + type; | 1553 | p = swap_info[type]; |
1540 | if (p->flags & SWP_WRITEOK) { | 1554 | if (p->flags & SWP_WRITEOK) { |
1541 | if (p->swap_file->f_mapping == mapping) | 1555 | if (p->swap_file->f_mapping == mapping) |
1542 | break; | 1556 | break; |
@@ -1555,18 +1569,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1555 | spin_unlock(&swap_lock); | 1569 | spin_unlock(&swap_lock); |
1556 | goto out_dput; | 1570 | goto out_dput; |
1557 | } | 1571 | } |
1558 | if (prev < 0) { | 1572 | if (prev < 0) |
1559 | swap_list.head = p->next; | 1573 | swap_list.head = p->next; |
1560 | } else { | 1574 | else |
1561 | swap_info[prev].next = p->next; | 1575 | swap_info[prev]->next = p->next; |
1562 | } | ||
1563 | if (type == swap_list.next) { | 1576 | if (type == swap_list.next) { |
1564 | /* just pick something that's safe... */ | 1577 | /* just pick something that's safe... */ |
1565 | swap_list.next = swap_list.head; | 1578 | swap_list.next = swap_list.head; |
1566 | } | 1579 | } |
1567 | if (p->prio < 0) { | 1580 | if (p->prio < 0) { |
1568 | for (i = p->next; i >= 0; i = swap_info[i].next) | 1581 | for (i = p->next; i >= 0; i = swap_info[i]->next) |
1569 | swap_info[i].prio = p->prio--; | 1582 | swap_info[i]->prio = p->prio--; |
1570 | least_priority++; | 1583 | least_priority++; |
1571 | } | 1584 | } |
1572 | nr_swap_pages -= p->pages; | 1585 | nr_swap_pages -= p->pages; |
@@ -1584,16 +1597,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1584 | if (p->prio < 0) | 1597 | if (p->prio < 0) |
1585 | p->prio = --least_priority; | 1598 | p->prio = --least_priority; |
1586 | prev = -1; | 1599 | prev = -1; |
1587 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | 1600 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { |
1588 | if (p->prio >= swap_info[i].prio) | 1601 | if (p->prio >= swap_info[i]->prio) |
1589 | break; | 1602 | break; |
1590 | prev = i; | 1603 | prev = i; |
1591 | } | 1604 | } |
1592 | p->next = i; | 1605 | p->next = i; |
1593 | if (prev < 0) | 1606 | if (prev < 0) |
1594 | swap_list.head = swap_list.next = p - swap_info; | 1607 | swap_list.head = swap_list.next = type; |
1595 | else | 1608 | else |
1596 | swap_info[prev].next = p - swap_info; | 1609 | swap_info[prev]->next = type; |
1597 | nr_swap_pages += p->pages; | 1610 | nr_swap_pages += p->pages; |
1598 | total_swap_pages += p->pages; | 1611 | total_swap_pages += p->pages; |
1599 | p->flags |= SWP_WRITEOK; | 1612 | p->flags |= SWP_WRITEOK; |
@@ -1606,6 +1619,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1606 | up_write(&swap_unplug_sem); | 1619 | up_write(&swap_unplug_sem); |
1607 | 1620 | ||
1608 | destroy_swap_extents(p); | 1621 | destroy_swap_extents(p); |
1622 | if (p->flags & SWP_CONTINUED) | ||
1623 | free_swap_count_continuations(p); | ||
1624 | |||
1609 | mutex_lock(&swapon_mutex); | 1625 | mutex_lock(&swapon_mutex); |
1610 | spin_lock(&swap_lock); | 1626 | spin_lock(&swap_lock); |
1611 | drain_mmlist(); | 1627 | drain_mmlist(); |
@@ -1653,8 +1669,8 @@ out: | |||
1653 | /* iterator */ | 1669 | /* iterator */ |
1654 | static void *swap_start(struct seq_file *swap, loff_t *pos) | 1670 | static void *swap_start(struct seq_file *swap, loff_t *pos) |
1655 | { | 1671 | { |
1656 | struct swap_info_struct *ptr = swap_info; | 1672 | struct swap_info_struct *si; |
1657 | int i; | 1673 | int type; |
1658 | loff_t l = *pos; | 1674 | loff_t l = *pos; |
1659 | 1675 | ||
1660 | mutex_lock(&swapon_mutex); | 1676 | mutex_lock(&swapon_mutex); |
@@ -1662,11 +1678,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1662 | if (!l) | 1678 | if (!l) |
1663 | return SEQ_START_TOKEN; | 1679 | return SEQ_START_TOKEN; |
1664 | 1680 | ||
1665 | for (i = 0; i < nr_swapfiles; i++, ptr++) { | 1681 | for (type = 0; type < nr_swapfiles; type++) { |
1666 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1682 | smp_rmb(); /* read nr_swapfiles before swap_info[type] */ |
1683 | si = swap_info[type]; | ||
1684 | if (!(si->flags & SWP_USED) || !si->swap_map) | ||
1667 | continue; | 1685 | continue; |
1668 | if (!--l) | 1686 | if (!--l) |
1669 | return ptr; | 1687 | return si; |
1670 | } | 1688 | } |
1671 | 1689 | ||
1672 | return NULL; | 1690 | return NULL; |
@@ -1674,21 +1692,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1674 | 1692 | ||
1675 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) | 1693 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) |
1676 | { | 1694 | { |
1677 | struct swap_info_struct *ptr; | 1695 | struct swap_info_struct *si = v; |
1678 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; | 1696 | int type; |
1679 | 1697 | ||
1680 | if (v == SEQ_START_TOKEN) | 1698 | if (v == SEQ_START_TOKEN) |
1681 | ptr = swap_info; | 1699 | type = 0; |
1682 | else { | 1700 | else |
1683 | ptr = v; | 1701 | type = si->type + 1; |
1684 | ptr++; | ||
1685 | } | ||
1686 | 1702 | ||
1687 | for (; ptr < endptr; ptr++) { | 1703 | for (; type < nr_swapfiles; type++) { |
1688 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1704 | smp_rmb(); /* read nr_swapfiles before swap_info[type] */ |
1705 | si = swap_info[type]; | ||
1706 | if (!(si->flags & SWP_USED) || !si->swap_map) | ||
1689 | continue; | 1707 | continue; |
1690 | ++*pos; | 1708 | ++*pos; |
1691 | return ptr; | 1709 | return si; |
1692 | } | 1710 | } |
1693 | 1711 | ||
1694 | return NULL; | 1712 | return NULL; |
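The smp_rmb() in the /proc/swaps iterators pairs with the smp_wmb() that swapon() now issues between storing swap_info[type] and incrementing nr_swapfiles (see the swapon hunk below). The same publication pattern in portable C11 atomics, purely as an illustration; the kernel uses explicit barriers, not C11:

#include <stdatomic.h>
#include <stdio.h>

static int *slots[8];
static atomic_uint nr_slots;

static void publish(int *p)
{
	unsigned int n = atomic_load_explicit(&nr_slots, memory_order_relaxed);

	slots[n] = p;				/* store the slot first... */
	atomic_store_explicit(&nr_slots, n + 1, memory_order_release);
}

static void reader(void)
{
	unsigned int n = atomic_load_explicit(&nr_slots, memory_order_acquire);

	for (unsigned int i = 0; i < n; i++)	/* ...so a visible count */
		printf("slot %u -> %d\n", i, *slots[i]);	/* implies a visible slot */
}

int main(void)
{
	static int v = 42;

	publish(&v);
	reader();
	return 0;
}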
@@ -1701,24 +1719,24 @@ static void swap_stop(struct seq_file *swap, void *v) | |||
1701 | 1719 | ||
1702 | static int swap_show(struct seq_file *swap, void *v) | 1720 | static int swap_show(struct seq_file *swap, void *v) |
1703 | { | 1721 | { |
1704 | struct swap_info_struct *ptr = v; | 1722 | struct swap_info_struct *si = v; |
1705 | struct file *file; | 1723 | struct file *file; |
1706 | int len; | 1724 | int len; |
1707 | 1725 | ||
1708 | if (ptr == SEQ_START_TOKEN) { | 1726 | if (si == SEQ_START_TOKEN) { |
1709 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); | 1727 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); |
1710 | return 0; | 1728 | return 0; |
1711 | } | 1729 | } |
1712 | 1730 | ||
1713 | file = ptr->swap_file; | 1731 | file = si->swap_file; |
1714 | len = seq_path(swap, &file->f_path, " \t\n\\"); | 1732 | len = seq_path(swap, &file->f_path, " \t\n\\"); |
1715 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", | 1733 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
1716 | len < 40 ? 40 - len : 1, " ", | 1734 | len < 40 ? 40 - len : 1, " ", |
1717 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? | 1735 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? |
1718 | "partition" : "file\t", | 1736 | "partition" : "file\t", |
1719 | ptr->pages << (PAGE_SHIFT - 10), | 1737 | si->pages << (PAGE_SHIFT - 10), |
1720 | ptr->inuse_pages << (PAGE_SHIFT - 10), | 1738 | si->inuse_pages << (PAGE_SHIFT - 10), |
1721 | ptr->prio); | 1739 | si->prio); |
1722 | return 0; | 1740 | return 0; |
1723 | } | 1741 | } |
1724 | 1742 | ||
@@ -1765,7 +1783,7 @@ late_initcall(max_swapfiles_check); | |||
1765 | */ | 1783 | */ |
1766 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | 1784 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) |
1767 | { | 1785 | { |
1768 | struct swap_info_struct * p; | 1786 | struct swap_info_struct *p; |
1769 | char *name = NULL; | 1787 | char *name = NULL; |
1770 | struct block_device *bdev = NULL; | 1788 | struct block_device *bdev = NULL; |
1771 | struct file *swap_file = NULL; | 1789 | struct file *swap_file = NULL; |
@@ -1773,36 +1791,58 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1773 | unsigned int type; | 1791 | unsigned int type; |
1774 | int i, prev; | 1792 | int i, prev; |
1775 | int error; | 1793 | int error; |
1776 | union swap_header *swap_header = NULL; | 1794 | union swap_header *swap_header; |
1777 | unsigned int nr_good_pages = 0; | 1795 | unsigned int nr_good_pages; |
1778 | int nr_extents = 0; | 1796 | int nr_extents = 0; |
1779 | sector_t span; | 1797 | sector_t span; |
1780 | unsigned long maxpages = 1; | 1798 | unsigned long maxpages; |
1781 | unsigned long swapfilepages; | 1799 | unsigned long swapfilepages; |
1782 | unsigned short *swap_map = NULL; | 1800 | unsigned char *swap_map = NULL; |
1783 | struct page *page = NULL; | 1801 | struct page *page = NULL; |
1784 | struct inode *inode = NULL; | 1802 | struct inode *inode = NULL; |
1785 | int did_down = 0; | 1803 | int did_down = 0; |
1786 | 1804 | ||
1787 | if (!capable(CAP_SYS_ADMIN)) | 1805 | if (!capable(CAP_SYS_ADMIN)) |
1788 | return -EPERM; | 1806 | return -EPERM; |
1807 | |||
1808 | p = kzalloc(sizeof(*p), GFP_KERNEL); | ||
1809 | if (!p) | ||
1810 | return -ENOMEM; | ||
1811 | |||
1789 | spin_lock(&swap_lock); | 1812 | spin_lock(&swap_lock); |
1790 | p = swap_info; | 1813 | for (type = 0; type < nr_swapfiles; type++) { |
1791 | for (type = 0 ; type < nr_swapfiles ; type++,p++) | 1814 | if (!(swap_info[type]->flags & SWP_USED)) |
1792 | if (!(p->flags & SWP_USED)) | ||
1793 | break; | 1815 | break; |
1816 | } | ||
1794 | error = -EPERM; | 1817 | error = -EPERM; |
1795 | if (type >= MAX_SWAPFILES) { | 1818 | if (type >= MAX_SWAPFILES) { |
1796 | spin_unlock(&swap_lock); | 1819 | spin_unlock(&swap_lock); |
1820 | kfree(p); | ||
1797 | goto out; | 1821 | goto out; |
1798 | } | 1822 | } |
1799 | if (type >= nr_swapfiles) | 1823 | if (type >= nr_swapfiles) { |
1800 | nr_swapfiles = type+1; | 1824 | p->type = type; |
1801 | memset(p, 0, sizeof(*p)); | 1825 | swap_info[type] = p; |
1802 | INIT_LIST_HEAD(&p->extent_list); | 1826 | /* |
1827 | * Write swap_info[type] before nr_swapfiles, in case a | ||
1828 | * racing procfs swap_start() or swap_next() is reading them. | ||
1829 | * (We never shrink nr_swapfiles, we never free this entry.) | ||
1830 | */ | ||
1831 | smp_wmb(); | ||
1832 | nr_swapfiles++; | ||
1833 | } else { | ||
1834 | kfree(p); | ||
1835 | p = swap_info[type]; | ||
1836 | /* | ||
1837 | * Do not memset this entry: a racing procfs swap_next() | ||
1838 | * would be relying on p->type to remain valid. | ||
1839 | */ | ||
1840 | } | ||
1841 | INIT_LIST_HEAD(&p->first_swap_extent.list); | ||
1803 | p->flags = SWP_USED; | 1842 | p->flags = SWP_USED; |
1804 | p->next = -1; | 1843 | p->next = -1; |
1805 | spin_unlock(&swap_lock); | 1844 | spin_unlock(&swap_lock); |
1845 | |||
1806 | name = getname(specialfile); | 1846 | name = getname(specialfile); |
1807 | error = PTR_ERR(name); | 1847 | error = PTR_ERR(name); |
1808 | if (IS_ERR(name)) { | 1848 | if (IS_ERR(name)) { |
@@ -1822,7 +1862,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1822 | 1862 | ||
1823 | error = -EBUSY; | 1863 | error = -EBUSY; |
1824 | for (i = 0; i < nr_swapfiles; i++) { | 1864 | for (i = 0; i < nr_swapfiles; i++) { |
1825 | struct swap_info_struct *q = &swap_info[i]; | 1865 | struct swap_info_struct *q = swap_info[i]; |
1826 | 1866 | ||
1827 | if (i == type || !q->swap_file) | 1867 | if (i == type || !q->swap_file) |
1828 | continue; | 1868 | continue; |
@@ -1897,6 +1937,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1897 | 1937 | ||
1898 | p->lowest_bit = 1; | 1938 | p->lowest_bit = 1; |
1899 | p->cluster_next = 1; | 1939 | p->cluster_next = 1; |
1940 | p->cluster_nr = 0; | ||
1900 | 1941 | ||
1901 | /* | 1942 | /* |
1902 | * Find out how many pages are allowed for a single swap | 1943 | * Find out how many pages are allowed for a single swap |
@@ -1913,9 +1954,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1913 | * swap pte. | 1954 | * swap pte. |
1914 | */ | 1955 | */ |
1915 | maxpages = swp_offset(pte_to_swp_entry( | 1956 | maxpages = swp_offset(pte_to_swp_entry( |
1916 | swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; | 1957 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; |
1917 | if (maxpages > swap_header->info.last_page) | 1958 | if (maxpages > swap_header->info.last_page) { |
1918 | maxpages = swap_header->info.last_page; | 1959 | maxpages = swap_header->info.last_page + 1; |
1960 | /* p->max is an unsigned int: don't overflow it */ | ||
1961 | if ((unsigned int)maxpages == 0) | ||
1962 | maxpages = UINT_MAX; | ||
1963 | } | ||
1919 | p->highest_bit = maxpages - 1; | 1964 | p->highest_bit = maxpages - 1; |
1920 | 1965 | ||
1921 | error = -EINVAL; | 1966 | error = -EINVAL; |
@@ -1932,30 +1977,31 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1932 | goto bad_swap; | 1977 | goto bad_swap; |
1933 | 1978 | ||
1934 | /* OK, set up the swap map and apply the bad block list */ | 1979 | /* OK, set up the swap map and apply the bad block list */ |
1935 | swap_map = vmalloc(maxpages * sizeof(short)); | 1980 | swap_map = vmalloc(maxpages); |
1936 | if (!swap_map) { | 1981 | if (!swap_map) { |
1937 | error = -ENOMEM; | 1982 | error = -ENOMEM; |
1938 | goto bad_swap; | 1983 | goto bad_swap; |
1939 | } | 1984 | } |
1940 | 1985 | ||
1941 | memset(swap_map, 0, maxpages * sizeof(short)); | 1986 | memset(swap_map, 0, maxpages); |
1987 | nr_good_pages = maxpages - 1; /* omit header page */ | ||
1988 | |||
1942 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1989 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
1943 | int page_nr = swap_header->info.badpages[i]; | 1990 | unsigned int page_nr = swap_header->info.badpages[i]; |
1944 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { | 1991 | if (page_nr == 0 || page_nr > swap_header->info.last_page) { |
1945 | error = -EINVAL; | 1992 | error = -EINVAL; |
1946 | goto bad_swap; | 1993 | goto bad_swap; |
1947 | } | 1994 | } |
1948 | swap_map[page_nr] = SWAP_MAP_BAD; | 1995 | if (page_nr < maxpages) { |
1996 | swap_map[page_nr] = SWAP_MAP_BAD; | ||
1997 | nr_good_pages--; | ||
1998 | } | ||
1949 | } | 1999 | } |
1950 | 2000 | ||
1951 | error = swap_cgroup_swapon(type, maxpages); | 2001 | error = swap_cgroup_swapon(type, maxpages); |
1952 | if (error) | 2002 | if (error) |
1953 | goto bad_swap; | 2003 | goto bad_swap; |
1954 | 2004 | ||
1955 | nr_good_pages = swap_header->info.last_page - | ||
1956 | swap_header->info.nr_badpages - | ||
1957 | 1 /* header page */; | ||
1958 | |||
1959 | if (nr_good_pages) { | 2005 | if (nr_good_pages) { |
1960 | swap_map[0] = SWAP_MAP_BAD; | 2006 | swap_map[0] = SWAP_MAP_BAD; |
1961 | p->max = maxpages; | 2007 | p->max = maxpages; |
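The bad-page loop now derives nr_good_pages incrementally and bounds-checks each entry against both the header's last_page and maxpages, instead of recomputing from nr_badpages afterwards. A compact model (sizes hypothetical, SWAP_MAP_BAD assumed to be 0x3f):

#include <stdio.h>

#define SWAP_MAP_BAD 0x3f

int main(void)
{
	unsigned char swap_map[16] = { 0 };
	unsigned int maxpages = 16, last_page = 31;
	unsigned int badpages[] = { 3, 20 };	/* 20 is beyond maxpages */
	unsigned int nr_good_pages = maxpages - 1;	/* omit header page */

	for (unsigned int i = 0; i < 2; i++) {
		unsigned int page_nr = badpages[i];

		if (page_nr == 0 || page_nr > last_page)
			return 1;		/* -EINVAL in the kernel */
		if (page_nr < maxpages) {
			swap_map[page_nr] = SWAP_MAP_BAD;
			nr_good_pages--;
		}
	}
	printf("nr_good_pages=%u\n", nr_good_pages);	/* nr_good_pages=14 */
	return 0;
}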
@@ -2003,18 +2049,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	/* insert swap space into swap_list: */
 	prev = -1;
-	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
-		if (p->prio >= swap_info[i].prio) {
+	for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+		if (p->prio >= swap_info[i]->prio)
 			break;
-		}
 		prev = i;
 	}
 	p->next = i;
-	if (prev < 0) {
-		swap_list.head = swap_list.next = p - swap_info;
-	} else {
-		swap_info[prev].next = p - swap_info;
-	}
+	if (prev < 0)
+		swap_list.head = swap_list.next = type;
+	else
+		swap_info[prev]->next = type;
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	error = 0;
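
The swap_list insertion keeps areas in descending priority order, chained by array index with -1 as the terminator; storing type directly replaces the old p - swap_info pointer arithmetic, which no longer works once swap_info holds pointers. A minimal userspace model of the same insertion (illustrative only; struct and names are hypothetical stand-ins):

#include <stdio.h>

/* Cut-down stand-in for the priority-chained swap_info entries. */
struct swap_ent {
	int prio;
	int next;	/* array index of next entry, -1 terminates */
};

static void insert_by_prio(struct swap_ent *tab, int *head, int type)
{
	int i, prev = -1;

	for (i = *head; i >= 0; i = tab[i].next) {
		if (tab[type].prio >= tab[i].prio)
			break;
		prev = i;
	}
	tab[type].next = i;
	if (prev < 0)
		*head = type;
	else
		tab[prev].next = type;
}

int main(void)
{
	struct swap_ent tab[3] = { {10, -1}, {5, -1}, {7, -1} };
	int head = -1, i;

	insert_by_prio(tab, &head, 0);
	insert_by_prio(tab, &head, 1);
	insert_by_prio(tab, &head, 2);
	for (i = head; i >= 0; i = tab[i].next)	/* 10, 7, 5 */
		printf("type %d prio %d\n", i, tab[i].prio);
	return 0;
}
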
@@ -2051,15 +2095,15 @@ out:
 
 void si_swapinfo(struct sysinfo *val)
 {
-	unsigned int i;
+	unsigned int type;
 	unsigned long nr_to_be_unused = 0;
 
 	spin_lock(&swap_lock);
-	for (i = 0; i < nr_swapfiles; i++) {
-		if (!(swap_info[i].flags & SWP_USED) ||
-		    (swap_info[i].flags & SWP_WRITEOK))
-			continue;
-		nr_to_be_unused += swap_info[i].inuse_pages;
+	for (type = 0; type < nr_swapfiles; type++) {
+		struct swap_info_struct *si = swap_info[type];
+
+		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
+			nr_to_be_unused += si->inuse_pages;
 	}
 	val->freeswap = nr_swap_pages + nr_to_be_unused;
 	val->totalswap = total_swap_pages + nr_to_be_unused;
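
si_swapinfo()'s rewritten loop folds the old double-negative skip into one positive test: count inuse_pages only for areas that are SWP_USED but no longer SWP_WRITEOK, i.e. mid-swapoff. A self-contained sketch of that predicate with the struct cut down to the two fields used here (illustrative only; the flag bit values mirror the kernel's, but the harness is hypothetical):

#include <stdio.h>

#define SWP_USED	0x01	/* area is in use */
#define SWP_WRITEOK	0x02	/* new allocations allowed */

/* Cut-down, illustrative stand-in for struct swap_info_struct. */
struct swap_info_struct {
	unsigned int flags;
	unsigned int inuse_pages;
};

int main(void)
{
	struct swap_info_struct a = { SWP_USED, 100 };		    /* swapoff in progress */
	struct swap_info_struct b = { SWP_USED | SWP_WRITEOK, 40 }; /* active */
	struct swap_info_struct *swap_info[] = { &a, &b };
	unsigned long nr_to_be_unused = 0;
	unsigned int type, nr_swapfiles = 2;

	for (type = 0; type < nr_swapfiles; type++) {
		struct swap_info_struct *si = swap_info[type];

		/* count pages only on areas being swapped off */
		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
			nr_to_be_unused += si->inuse_pages;
	}
	printf("%lu\n", nr_to_be_unused);	/* prints 100 */
	return 0;
}
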
@@ -2069,101 +2113,111 @@ void si_swapinfo(struct sysinfo *val)
 /*
  * Verify that a swap entry is valid and increment its swap map count.
  *
- * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
- * "permanent", but will be reclaimed by the next swapoff.
  * Returns error code in following case.
  * - success -> 0
  * - swp_entry is invalid -> EINVAL
  * - swp_entry is migration entry -> EINVAL
  * - swap-cache reference is requested but there is already one. -> EEXIST
  * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
  */
-static int __swap_duplicate(swp_entry_t entry, bool cache)
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 {
-	struct swap_info_struct * p;
+	struct swap_info_struct *p;
 	unsigned long offset, type;
-	int result = -EINVAL;
-	int count;
-	bool has_cache;
+	unsigned char count;
+	unsigned char has_cache;
+	int err = -EINVAL;
 
 	if (non_swap_entry(entry))
-		return -EINVAL;
+		goto out;
 
 	type = swp_type(entry);
 	if (type >= nr_swapfiles)
 		goto bad_file;
-	p = type + swap_info;
+	p = swap_info[type];
 	offset = swp_offset(entry);
 
 	spin_lock(&swap_lock);
-
 	if (unlikely(offset >= p->max))
 		goto unlock_out;
 
-	count = swap_count(p->swap_map[offset]);
-	has_cache = swap_has_cache(p->swap_map[offset]);
+	count = p->swap_map[offset];
+	has_cache = count & SWAP_HAS_CACHE;
+	count &= ~SWAP_HAS_CACHE;
+	err = 0;
 
-	if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
+	if (usage == SWAP_HAS_CACHE) {
 
 		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
-		if (!has_cache && count) {
-			p->swap_map[offset] = encode_swapmap(count, true);
-			result = 0;
-		} else if (has_cache) /* someone added cache */
-			result = -EEXIST;
-		else if (!count) /* no users */
-			result = -ENOENT;
+		if (!has_cache && count)
+			has_cache = SWAP_HAS_CACHE;
+		else if (has_cache)		/* someone else added cache */
+			err = -EEXIST;
+		else				/* no users remaining */
+			err = -ENOENT;
 
 	} else if (count || has_cache) {
-		if (count < SWAP_MAP_MAX - 1) {
-			p->swap_map[offset] = encode_swapmap(count + 1,
-							has_cache);
-			result = 0;
-		} else if (count <= SWAP_MAP_MAX) {
-			if (swap_overflow++ < 5)
-				printk(KERN_WARNING
				       "swap_dup: swap entry overflow\n");
-			p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
-							has_cache);
-			result = 0;
-		}
+
+		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+			count += usage;
+		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
+			err = -EINVAL;
+		else if (swap_count_continued(p, offset, count))
+			count = COUNT_CONTINUED;
+		else
+			err = -ENOMEM;
 	} else
-		result = -ENOENT; /* unused swap entry */
+		err = -ENOENT;			/* unused swap entry */
+
+	p->swap_map[offset] = count | has_cache;
+
 unlock_out:
 	spin_unlock(&swap_lock);
 out:
-	return result;
+	return err;
 
 bad_file:
 	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
 	goto out;
 }
+
+/*
+ * Help swapoff by noting that swap entry belongs to shmem/tmpfs
+ * (in which case its reference count is never incremented).
+ */
+void swap_shmem_alloc(swp_entry_t entry)
+{
+	__swap_duplicate(entry, SWAP_MAP_SHMEM);
+}
+
 /*
- * increase reference count of swap entry by 1.
+ * Increase reference count of swap entry by 1.
+ * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
+ * but could not be atomically allocated.  Returns 0, just as if it succeeded,
+ * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
+ * might occur if a page table entry has got corrupted.
  */
-void swap_duplicate(swp_entry_t entry)
+int swap_duplicate(swp_entry_t entry)
 {
-	__swap_duplicate(entry, SWAP_MAP);
+	int err = 0;
+
+	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+		err = add_swap_count_continuation(entry, GFP_ATOMIC);
+	return err;
 }
 
 /*
  * @entry: swap entry for which we allocate swap cache.
  *
- * Called when allocating swap cache for exising swap entry,
+ * Called when allocating swap cache for existing swap entry,
  * This can return error codes. Returns 0 at success.
  * -EBUSY means there is a swap cache.
  * Note: return code is different from swap_duplicate().
  */
 int swapcache_prepare(swp_entry_t entry)
 {
-	return __swap_duplicate(entry, SWAP_CACHE);
-}
-
-
-struct swap_info_struct *
-get_swap_info_struct(unsigned type)
-{
-	return &swap_info[type];
+	return __swap_duplicate(entry, SWAP_HAS_CACHE);
 }
 
 /*
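
__swap_duplicate() now treats each swap_map byte as a small count plus a SWAP_HAS_CACHE flag bit, split apart on entry and recombined by the single store count | has_cache. A simplified model of that split/recombine, without locking or continuation handling (illustrative only; the constants match this series' include/linux/swap.h, but duplicate() and its error convention are hypothetical):

#include <stdio.h>

#define SWAP_HAS_CACHE	0x40	/* page is in swap cache */
#define COUNT_CONTINUED	0x80	/* count continues in extra pages */
#define SWAP_MAP_MAX	0x3e	/* max count held in the first swap_map */

/* Illustrative-only model of __swap_duplicate()'s byte handling. */
static int duplicate(unsigned char *slot, unsigned char usage)
{
	unsigned char count = *slot;
	unsigned char has_cache = count & SWAP_HAS_CACHE;

	count &= ~SWAP_HAS_CACHE;
	if (usage == SWAP_HAS_CACHE) {
		if (!has_cache && count)
			has_cache = SWAP_HAS_CACHE;
		else
			return -1;	/* -EEXIST or -ENOENT in the kernel */
	} else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) {
		count += usage;
	} else {
		return -1;		/* would need a continuation page */
	}
	*slot = count | has_cache;	/* recombine count and flag */
	return 0;
}

int main(void)
{
	unsigned char slot = 1;			/* one map reference */

	duplicate(&slot, 1);			/* second mapping */
	duplicate(&slot, SWAP_HAS_CACHE);	/* add swap-cache reference */
	printf("0x%02x\n", slot);		/* prints 0x42: count 2 + cache */
	return 0;
}
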
@@ -2181,7 +2235,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 	if (!our_page_cluster)	/* no readahead */
 		return 0;
 
-	si = &swap_info[swp_type(entry)];
+	si = swap_info[swp_type(entry)];
 	target = swp_offset(entry);
 	base = (target >> our_page_cluster) << our_page_cluster;
 	end = base + (1 << our_page_cluster);
@@ -2217,3 +2271,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 	*offset = ++toff;
 	return nr_pages? ++nr_pages: 0;
 }
+
+/*
+ * add_swap_count_continuation - called when a swap count is duplicated
+ * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
+ * page of the original vmalloc'ed swap_map, to hold the continuation count
+ * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
+ * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
+ *
+ * These continuation pages are seldom referenced: the common paths all work
+ * on the original swap_map, only referring to a continuation page when the
+ * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
+ *
+ * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
+ * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
+ * can be called after dropping locks.
+ */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+	struct swap_info_struct *si;
+	struct page *head;
+	struct page *page;
+	struct page *list_page;
+	pgoff_t offset;
+	unsigned char count;
+
+	/*
+	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
+	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
+	 */
+	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+
+	si = swap_info_get(entry);
+	if (!si) {
+		/*
+		 * An acceptable race has occurred since the failing
+		 * __swap_duplicate(): the swap entry has been freed,
+		 * perhaps even the whole swap_map cleared for swapoff.
+		 */
+		goto outer;
+	}
+
+	offset = swp_offset(entry);
+	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+
+	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
+		/*
+		 * The higher the swap count, the more likely it is that tasks
+		 * will race to add swap count continuation: we need to avoid
+		 * over-provisioning.
+		 */
+		goto out;
+	}
+
+	if (!page) {
+		spin_unlock(&swap_lock);
+		return -ENOMEM;
+	}
+
+	/*
+	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
+	 * no architecture is using highmem pages for kernel pagetables: so it
+	 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
+	 */
+	head = vmalloc_to_page(si->swap_map + offset);
+	offset &= ~PAGE_MASK;
+
+	/*
+	 * Page allocation does not initialize the page's lru field,
+	 * but it does always reset its private field.
+	 */
+	if (!page_private(head)) {
+		BUG_ON(count & COUNT_CONTINUED);
+		INIT_LIST_HEAD(&head->lru);
+		set_page_private(head, SWP_CONTINUED);
+		si->flags |= SWP_CONTINUED;
+	}
+
+	list_for_each_entry(list_page, &head->lru, lru) {
+		unsigned char *map;
+
+		/*
+		 * If the previous map said no continuation, but we've found
+		 * a continuation page, free our allocation and use this one.
+		 */
+		if (!(count & COUNT_CONTINUED))
+			goto out;
+
+		map = kmap_atomic(list_page, KM_USER0) + offset;
+		count = *map;
+		kunmap_atomic(map, KM_USER0);
+
+		/*
+		 * If this continuation count now has some space in it,
+		 * free our allocation and use this one.
+		 */
+		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
+			goto out;
+	}
+
+	list_add_tail(&page->lru, &head->lru);
+	page = NULL;			/* now it's attached, don't free it */
+out:
+	spin_unlock(&swap_lock);
+outer:
+	if (page)
+		__free_page(page);
+	return 0;
+}
+
+/*
+ * swap_count_continued - when the original swap_map count is incremented
+ * from SWAP_MAP_MAX, check if there is already a continuation page to carry
+ * into, carry if so, or else fail until a new continuation page is allocated;
+ * when the original swap_map count is decremented from 0 with continuation,
+ * borrow from the continuation and report whether it still holds more.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ */
+static bool swap_count_continued(struct swap_info_struct *si,
+				 pgoff_t offset, unsigned char count)
+{
+	struct page *head;
+	struct page *page;
+	unsigned char *map;
+
+	head = vmalloc_to_page(si->swap_map + offset);
+	if (page_private(head) != SWP_CONTINUED) {
+		BUG_ON(count & COUNT_CONTINUED);
+		return false;		/* need to add count continuation */
+	}
+
+	offset &= ~PAGE_MASK;
+	page = list_entry(head->lru.next, struct page, lru);
+	map = kmap_atomic(page, KM_USER0) + offset;
+
+	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
+		goto init_map;		/* jump over SWAP_CONT_MAX checks */
+
+	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
+		/*
+		 * Think of how you add 1 to 999
+		 */
+		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			BUG_ON(page == head);
+			map = kmap_atomic(page, KM_USER0) + offset;
+		}
+		if (*map == SWAP_CONT_MAX) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			if (page == head)
+				return false;	/* add count continuation */
+			map = kmap_atomic(page, KM_USER0) + offset;
+init_map:		*map = 0;		/* we didn't zero the page */
+		}
+		*map += 1;
+		kunmap_atomic(map, KM_USER0);
+		page = list_entry(page->lru.prev, struct page, lru);
+		while (page != head) {
+			map = kmap_atomic(page, KM_USER0) + offset;
+			*map = COUNT_CONTINUED;
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.prev, struct page, lru);
+		}
+		return true;			/* incremented */
+
+	} else {				/* decrementing */
+		/*
+		 * Think of how you subtract 1 from 1000
+		 */
+		BUG_ON(count != COUNT_CONTINUED);
+		while (*map == COUNT_CONTINUED) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			BUG_ON(page == head);
+			map = kmap_atomic(page, KM_USER0) + offset;
+		}
+		BUG_ON(*map == 0);
+		*map -= 1;
+		if (*map == 0)
+			count = 0;
+		kunmap_atomic(map, KM_USER0);
+		page = list_entry(page->lru.prev, struct page, lru);
+		while (page != head) {
+			map = kmap_atomic(page, KM_USER0) + offset;
+			*map = SWAP_CONT_MAX | count;
+			count = COUNT_CONTINUED;
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.prev, struct page, lru);
+		}
+		return count == COUNT_CONTINUED;
+	}
+}
+
+/*
+ * free_swap_count_continuations - swapoff free all the continuation pages
+ * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
+ */
+static void free_swap_count_continuations(struct swap_info_struct *si)
+{
+	pgoff_t offset;
+
+	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
+		struct page *head;
+		head = vmalloc_to_page(si->swap_map + offset);
+		if (page_private(head)) {
+			struct list_head *this, *next;
+			list_for_each_safe(this, next, &head->lru) {
+				struct page *page;
+				page = list_entry(this, struct page, lru);
+				list_del(this);
+				__free_page(page);
+			}
+		}
+	}
+}
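
The continuation scheme is positional arithmetic: the swap_map byte is the low "digit" (capacity SWAP_MAP_MAX) and each continuation page contributes one more digit (capacity SWAP_CONT_MAX) at the same offset. A simplified userspace model of the increment carry in swap_count_continued(), omitting the COUNT_CONTINUED markers the real code threads through each byte (illustrative only; the harness and digit array are hypothetical):

#include <stdio.h>

#define SWAP_MAP_MAX	0x3e	/* capacity of the first swap_map "digit" */
#define SWAP_CONT_MAX	0x7f	/* capacity of each continuation "digit" */
#define NDIGITS		4

/*
 * Illustrative-only model of the "add 1 to 999" carry: digit[0] is the
 * swap_map byte, digit[1..] live in continuation pages.
 */
static int carry_increment(unsigned char *digit)
{
	int i;

	if (digit[0] < SWAP_MAP_MAX) {
		digit[0]++;			/* the common, fast case */
		return 0;
	}
	for (i = 1; i < NDIGITS; i++) {		/* find a digit with room */
		if (digit[i] < SWAP_CONT_MAX) {
			digit[i]++;
			while (--i > 0)
				digit[i] = 0;	/* lower digits wrap */
			digit[0] = 0;
			return 0;
		}
	}
	return -1;	/* all continuation digits full: need a new page */
}

int main(void)
{
	unsigned char digit[NDIGITS] = { SWAP_MAP_MAX, SWAP_CONT_MAX, 0, 0 };

	carry_increment(digit);		/* carries into the third digit */
	printf("%02x %02x %02x %02x\n",
	       digit[0], digit[1], digit[2], digit[3]);	/* 00 00 01 00 */
	return 0;
}

The decrement path is the mirror image ("subtract 1 from 1000"): borrow from the first non-zero higher digit and refill the lower ones at SWAP_CONT_MAX, which is exactly what the page->lru.prev walk above does.
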