path: root/mm/swapfile.c
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--	mm/swapfile.c	596
1 file changed, 464 insertions(+), 132 deletions(-)
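Note: the cluster helpers added in this patch operate on struct swap_cluster_info, which is defined elsewhere in the series (include/linux/swap.h) and does not appear in this diff. As a rough illustration only, with the field widths and flag values assumed here rather than taken from this patch, the encoding and a walk over the resulting free-cluster list look like this:

/*
 * Illustrative sketch only; not part of this patch.  The helpers in the
 * diff treat swap_cluster_info as a tagged word: "data" is either a usage
 * count or the index of the next cluster on a list, and "flags" says which.
 * Treat the exact layout below as an assumption for illustration.
 */
#define CLUSTER_FLAG_FREE	1	/* cluster is on the free list */
#define CLUSTER_FLAG_NEXT_NULL	2	/* "next" field is a null marker */

struct swap_cluster_info {
	unsigned int data:24;	/* usage count or next-cluster index */
	unsigned int flags:8;	/* CLUSTER_FLAG_* */
};

/* Count clusters on a list built with cluster_set_next()/cluster_set_next_flag(). */
static unsigned int count_clusters(struct swap_cluster_info *ci,
				   struct swap_cluster_info head,
				   struct swap_cluster_info tail)
{
	unsigned int idx, n = 0;

	if (head.flags & CLUSTER_FLAG_NEXT_NULL)	/* empty list */
		return 0;
	for (idx = head.data; ; idx = ci[idx].data) {	/* follow next indices */
		n++;
		if (idx == tail.data)			/* reached the tail */
			break;
	}
	return n;
}

Reusing the same 24-bit field for either a count or a next index is what lets the patch keep exactly one swap_cluster_info per cluster while still threading free and to-be-discarded clusters into lists.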
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6cf2e60983b7..3963fc24fcc1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -175,14 +175,296 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 	}
 }
 
-static int wait_for_discard(void *word)
-{
-	schedule();
-	return 0;
-}
-
 #define SWAPFILE_CLUSTER	256
 #define LATENCY_LIMIT		256
 
+static inline void cluster_set_flag(struct swap_cluster_info *info,
+	unsigned int flag)
+{
+	info->flags = flag;
+}
+
+static inline unsigned int cluster_count(struct swap_cluster_info *info)
+{
+	return info->data;
+}
+
+static inline void cluster_set_count(struct swap_cluster_info *info,
+				     unsigned int c)
+{
+	info->data = c;
+}
+
+static inline void cluster_set_count_flag(struct swap_cluster_info *info,
+					  unsigned int c, unsigned int f)
+{
+	info->flags = f;
+	info->data = c;
+}
+
+static inline unsigned int cluster_next(struct swap_cluster_info *info)
+{
+	return info->data;
+}
+
+static inline void cluster_set_next(struct swap_cluster_info *info,
+				    unsigned int n)
+{
+	info->data = n;
+}
+
+static inline void cluster_set_next_flag(struct swap_cluster_info *info,
+					 unsigned int n, unsigned int f)
+{
+	info->flags = f;
+	info->data = n;
+}
+
+static inline bool cluster_is_free(struct swap_cluster_info *info)
+{
+	return info->flags & CLUSTER_FLAG_FREE;
+}
+
+static inline bool cluster_is_null(struct swap_cluster_info *info)
+{
+	return info->flags & CLUSTER_FLAG_NEXT_NULL;
+}
+
+static inline void cluster_set_null(struct swap_cluster_info *info)
+{
+	info->flags = CLUSTER_FLAG_NEXT_NULL;
+	info->data = 0;
+}
+
+/* Add a cluster to discard list and schedule it to do discard */
+static void swap_cluster_schedule_discard(struct swap_info_struct *si,
+		unsigned int idx)
+{
+	/*
+	 * If scan_swap_map() can't find a free cluster, it will check
+	 * si->swap_map directly. To make sure the discarding cluster isn't
+	 * taken by scan_swap_map(), mark the swap entries bad (occupied). It
+	 * will be cleared after discard
+	 */
+	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
+			SWAP_MAP_BAD, SWAPFILE_CLUSTER);
+
+	if (cluster_is_null(&si->discard_cluster_head)) {
+		cluster_set_next_flag(&si->discard_cluster_head,
+						idx, 0);
+		cluster_set_next_flag(&si->discard_cluster_tail,
+						idx, 0);
+	} else {
+		unsigned int tail = cluster_next(&si->discard_cluster_tail);
+		cluster_set_next(&si->cluster_info[tail], idx);
+		cluster_set_next_flag(&si->discard_cluster_tail,
+						idx, 0);
+	}
+
+	schedule_work(&si->discard_work);
+}
+
+/*
+ * Doing discard actually. After a cluster discard is finished, the cluster
+ * will be added to free cluster list. caller should hold si->lock.
+*/
+static void swap_do_scheduled_discard(struct swap_info_struct *si)
+{
+	struct swap_cluster_info *info;
+	unsigned int idx;
+
+	info = si->cluster_info;
+
+	while (!cluster_is_null(&si->discard_cluster_head)) {
+		idx = cluster_next(&si->discard_cluster_head);
+
+		cluster_set_next_flag(&si->discard_cluster_head,
+						cluster_next(&info[idx]), 0);
+		if (cluster_next(&si->discard_cluster_tail) == idx) {
+			cluster_set_null(&si->discard_cluster_head);
+			cluster_set_null(&si->discard_cluster_tail);
+		}
+		spin_unlock(&si->lock);
+
+		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
+				SWAPFILE_CLUSTER);
+
+		spin_lock(&si->lock);
+		cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
+		if (cluster_is_null(&si->free_cluster_head)) {
+			cluster_set_next_flag(&si->free_cluster_head,
+						idx, 0);
+			cluster_set_next_flag(&si->free_cluster_tail,
+						idx, 0);
+		} else {
+			unsigned int tail;
+
+			tail = cluster_next(&si->free_cluster_tail);
+			cluster_set_next(&info[tail], idx);
+			cluster_set_next_flag(&si->free_cluster_tail,
+						idx, 0);
+		}
+		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
+				0, SWAPFILE_CLUSTER);
+	}
+}
+
+static void swap_discard_work(struct work_struct *work)
+{
+	struct swap_info_struct *si;
+
+	si = container_of(work, struct swap_info_struct, discard_work);
+
+	spin_lock(&si->lock);
+	swap_do_scheduled_discard(si);
+	spin_unlock(&si->lock);
+}
+
+/*
+ * The cluster corresponding to page_nr will be used. The cluster will be
+ * removed from free cluster list and its usage counter will be increased.
+ */
+static void inc_cluster_info_page(struct swap_info_struct *p,
+	struct swap_cluster_info *cluster_info, unsigned long page_nr)
+{
+	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+
+	if (!cluster_info)
+		return;
+	if (cluster_is_free(&cluster_info[idx])) {
+		VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx);
+		cluster_set_next_flag(&p->free_cluster_head,
+			cluster_next(&cluster_info[idx]), 0);
+		if (cluster_next(&p->free_cluster_tail) == idx) {
+			cluster_set_null(&p->free_cluster_tail);
+			cluster_set_null(&p->free_cluster_head);
+		}
+		cluster_set_count_flag(&cluster_info[idx], 0, 0);
+	}
+
+	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
+	cluster_set_count(&cluster_info[idx],
+		cluster_count(&cluster_info[idx]) + 1);
+}
+
+/*
+ * The cluster corresponding to page_nr decreases one usage. If the usage
+ * counter becomes 0, which means no page in the cluster is in using, we can
+ * optionally discard the cluster and add it to free cluster list.
+ */
+static void dec_cluster_info_page(struct swap_info_struct *p,
+	struct swap_cluster_info *cluster_info, unsigned long page_nr)
+{
+	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+
+	if (!cluster_info)
+		return;
+
+	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
+	cluster_set_count(&cluster_info[idx],
+		cluster_count(&cluster_info[idx]) - 1);
+
+	if (cluster_count(&cluster_info[idx]) == 0) {
+		/*
+		 * If the swap is discardable, prepare discard the cluster
+		 * instead of free it immediately. The cluster will be freed
+		 * after discard.
+		 */
+		if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
+				 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
+			swap_cluster_schedule_discard(p, idx);
+			return;
+		}
+
+		cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
+		if (cluster_is_null(&p->free_cluster_head)) {
+			cluster_set_next_flag(&p->free_cluster_head, idx, 0);
+			cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
+		} else {
+			unsigned int tail = cluster_next(&p->free_cluster_tail);
+			cluster_set_next(&cluster_info[tail], idx);
+			cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
+		}
+	}
+}
+
+/*
+ * It's possible scan_swap_map() uses a free cluster in the middle of free
+ * cluster list. Avoiding such abuse to avoid list corruption.
+ */
+static bool
+scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
+	unsigned long offset)
+{
+	struct percpu_cluster *percpu_cluster;
+	bool conflict;
+
+	offset /= SWAPFILE_CLUSTER;
+	conflict = !cluster_is_null(&si->free_cluster_head) &&
+		offset != cluster_next(&si->free_cluster_head) &&
+		cluster_is_free(&si->cluster_info[offset]);
+
+	if (!conflict)
+		return false;
+
+	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
+	cluster_set_null(&percpu_cluster->index);
+	return true;
+}
+
+/*
+ * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
+ * might involve allocating a new cluster for current CPU too.
+ */
+static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+	unsigned long *offset, unsigned long *scan_base)
+{
+	struct percpu_cluster *cluster;
+	bool found_free;
+	unsigned long tmp;
+
+new_cluster:
+	cluster = this_cpu_ptr(si->percpu_cluster);
+	if (cluster_is_null(&cluster->index)) {
+		if (!cluster_is_null(&si->free_cluster_head)) {
+			cluster->index = si->free_cluster_head;
+			cluster->next = cluster_next(&cluster->index) *
+					SWAPFILE_CLUSTER;
+		} else if (!cluster_is_null(&si->discard_cluster_head)) {
+			/*
+			 * we don't have free cluster but have some clusters in
+			 * discarding, do discard now and reclaim them
+			 */
+			swap_do_scheduled_discard(si);
+			*scan_base = *offset = si->cluster_next;
+			goto new_cluster;
+		} else
+			return;
+	}
+
+	found_free = false;
+
+	/*
+	 * Other CPUs can use our cluster if they can't find a free cluster,
+	 * check if there is still free entry in the cluster
+	 */
+	tmp = cluster->next;
+	while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
+	       SWAPFILE_CLUSTER) {
+		if (!si->swap_map[tmp]) {
+			found_free = true;
+			break;
+		}
+		tmp++;
+	}
+	if (!found_free) {
+		cluster_set_null(&cluster->index);
+		goto new_cluster;
+	}
+	cluster->next = tmp + 1;
+	*offset = tmp;
+	*scan_base = tmp;
+}
+
 static unsigned long scan_swap_map(struct swap_info_struct *si,
 				   unsigned char usage)
@@ -191,7 +473,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 	unsigned long scan_base;
 	unsigned long last_in_cluster = 0;
 	int latency_ration = LATENCY_LIMIT;
-	int found_free_cluster = 0;
 
 	/*
 	 * We try to cluster swap pages by allocating them sequentially
@@ -207,24 +488,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 	si->flags += SWP_SCANNING;
 	scan_base = offset = si->cluster_next;
 
+	/* SSD algorithm */
+	if (si->cluster_info) {
+		scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+		goto checks;
+	}
+
 	if (unlikely(!si->cluster_nr--)) {
 		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
 			si->cluster_nr = SWAPFILE_CLUSTER - 1;
 			goto checks;
 		}
-		if (si->flags & SWP_PAGE_DISCARD) {
-			/*
-			 * Start range check on racing allocations, in case
-			 * they overlap the cluster we eventually decide on
-			 * (we scan without swap_lock to allow preemption).
-			 * It's hardly conceivable that cluster_nr could be
-			 * wrapped during our scan, but don't depend on it.
-			 */
-			if (si->lowest_alloc)
-				goto checks;
-			si->lowest_alloc = si->max;
-			si->highest_alloc = 0;
-		}
+
 		spin_unlock(&si->lock);
 
 		/*
@@ -248,7 +523,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 			offset -= SWAPFILE_CLUSTER - 1;
 			si->cluster_next = offset;
 			si->cluster_nr = SWAPFILE_CLUSTER - 1;
-			found_free_cluster = 1;
 			goto checks;
 		}
 		if (unlikely(--latency_ration < 0)) {
@@ -269,7 +543,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 			offset -= SWAPFILE_CLUSTER - 1;
 			si->cluster_next = offset;
 			si->cluster_nr = SWAPFILE_CLUSTER - 1;
-			found_free_cluster = 1;
 			goto checks;
 		}
 		if (unlikely(--latency_ration < 0)) {
@@ -281,10 +554,13 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 		offset = scan_base;
 		spin_lock(&si->lock);
 		si->cluster_nr = SWAPFILE_CLUSTER - 1;
-		si->lowest_alloc = 0;
 	}
 
 checks:
+	if (si->cluster_info) {
+		while (scan_swap_map_ssd_cluster_conflict(si, offset))
+			scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+	}
 	if (!(si->flags & SWP_WRITEOK))
 		goto no_page;
 	if (!si->highest_bit)
@@ -317,62 +593,10 @@ checks:
 		si->highest_bit = 0;
 	}
 	si->swap_map[offset] = usage;
+	inc_cluster_info_page(si, si->cluster_info, offset);
 	si->cluster_next = offset + 1;
 	si->flags -= SWP_SCANNING;
 
-	if (si->lowest_alloc) {
-		/*
-		 * Only set when SWP_PAGE_DISCARD, and there's a scan
-		 * for a free cluster in progress or just completed.
-		 */
-		if (found_free_cluster) {
-			/*
-			 * To optimize wear-levelling, discard the
-			 * old data of the cluster, taking care not to
-			 * discard any of its pages that have already
-			 * been allocated by racing tasks (offset has
-			 * already stepped over any at the beginning).
-			 */
-			if (offset < si->highest_alloc &&
-			    si->lowest_alloc <= last_in_cluster)
-				last_in_cluster = si->lowest_alloc - 1;
-			si->flags |= SWP_DISCARDING;
-			spin_unlock(&si->lock);
-
-			if (offset < last_in_cluster)
-				discard_swap_cluster(si, offset,
-					last_in_cluster - offset + 1);
-
-			spin_lock(&si->lock);
-			si->lowest_alloc = 0;
-			si->flags &= ~SWP_DISCARDING;
-
-			smp_mb();	/* wake_up_bit advises this */
-			wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
-
-		} else if (si->flags & SWP_DISCARDING) {
-			/*
-			 * Delay using pages allocated by racing tasks
-			 * until the whole discard has been issued. We
-			 * could defer that delay until swap_writepage,
-			 * but it's easier to keep this self-contained.
-			 */
-			spin_unlock(&si->lock);
-			wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
-				wait_for_discard, TASK_UNINTERRUPTIBLE);
-			spin_lock(&si->lock);
-		} else {
-			/*
-			 * Note pages allocated by racing tasks while
-			 * scan for a free cluster is in progress, so
-			 * that its final discard can exclude them.
-			 */
-			if (offset < si->lowest_alloc)
-				si->lowest_alloc = offset;
-			if (offset > si->highest_alloc)
-				si->highest_alloc = offset;
-		}
-	}
 	return offset;
 
 scan:
@@ -527,16 +751,16 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
 	return p;
 
 bad_free:
-	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
+	pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val);
 	goto out;
 bad_offset:
-	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
+	pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val);
 	goto out;
 bad_device:
-	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
+	pr_err("swap_free: %s%08lx\n", Unused_file, entry.val);
 	goto out;
 bad_nofile:
-	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
+	pr_err("swap_free: %s%08lx\n", Bad_file, entry.val);
 out:
 	return NULL;
 }
@@ -600,6 +824,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
 
 	/* free if no reference */
 	if (!usage) {
+		dec_cluster_info_page(p, p->cluster_info, offset);
 		if (offset < p->lowest_bit)
 			p->lowest_bit = offset;
 		if (offset > p->highest_bit)
@@ -1107,7 +1332,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
 			else
 				continue;
 		}
-		count = si->swap_map[i];
+		count = ACCESS_ONCE(si->swap_map[i]);
 		if (count && swap_count(count) != SWAP_MAP_BAD)
 			break;
 	}
@@ -1127,7 +1352,11 @@ int try_to_unuse(unsigned int type, bool frontswap,
 {
 	struct swap_info_struct *si = swap_info[type];
 	struct mm_struct *start_mm;
-	unsigned char *swap_map;
+	volatile unsigned char *swap_map; /* swap_map is accessed without
+					   * locking. Mark it as volatile
+					   * to prevent compiler doing
+					   * something odd.
+					   */
 	unsigned char swcount;
 	struct page *page;
 	swp_entry_t entry;
@@ -1178,7 +1407,15 @@ int try_to_unuse(unsigned int type, bool frontswap,
 			 * reused since sys_swapoff() already disabled
 			 * allocation from here, or alloc_page() failed.
 			 */
-			if (!*swap_map)
+			swcount = *swap_map;
+			/*
+			 * We don't hold lock here, so the swap entry could be
+			 * SWAP_MAP_BAD (when the cluster is discarding).
+			 * Instead of fail out, We can just skip the swap
+			 * entry because swapoff will wait for discarding
+			 * finish anyway.
+			 */
+			if (!swcount || swcount == SWAP_MAP_BAD)
 				continue;
 			retval = -ENOMEM;
 			break;
@@ -1524,7 +1761,8 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 }
 
 static void _enable_swap_info(struct swap_info_struct *p, int prio,
-				unsigned char *swap_map)
+				unsigned char *swap_map,
+				struct swap_cluster_info *cluster_info)
 {
 	int i, prev;
 
@@ -1533,6 +1771,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
 	else
 		p->prio = --least_priority;
 	p->swap_map = swap_map;
+	p->cluster_info = cluster_info;
 	p->flags |= SWP_WRITEOK;
 	atomic_long_add(p->pages, &nr_swap_pages);
 	total_swap_pages += p->pages;
@@ -1553,12 +1792,13 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
 
 static void enable_swap_info(struct swap_info_struct *p, int prio,
 				unsigned char *swap_map,
+				struct swap_cluster_info *cluster_info,
 				unsigned long *frontswap_map)
 {
 	frontswap_init(p->type, frontswap_map);
 	spin_lock(&swap_lock);
 	spin_lock(&p->lock);
-	_enable_swap_info(p, prio, swap_map);
+	_enable_swap_info(p, prio, swap_map, cluster_info);
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
 }
@@ -1567,7 +1807,7 @@ static void reinsert_swap_info(struct swap_info_struct *p)
 {
 	spin_lock(&swap_lock);
 	spin_lock(&p->lock);
-	_enable_swap_info(p, p->prio, p->swap_map);
+	_enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
 }
@@ -1576,6 +1816,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
 	struct swap_info_struct *p = NULL;
 	unsigned char *swap_map;
+	struct swap_cluster_info *cluster_info;
 	unsigned long *frontswap_map;
 	struct file *swap_file, *victim;
 	struct address_space *mapping;
@@ -1651,6 +1892,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 		goto out_dput;
 	}
 
+	flush_work(&p->discard_work);
+
 	destroy_swap_extents(p);
 	if (p->flags & SWP_CONTINUED)
 		free_swap_count_continuations(p);
@@ -1675,6 +1918,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	p->max = 0;
 	swap_map = p->swap_map;
 	p->swap_map = NULL;
+	cluster_info = p->cluster_info;
+	p->cluster_info = NULL;
 	p->flags = 0;
 	frontswap_map = frontswap_map_get(p);
 	frontswap_map_set(p, NULL);
@@ -1682,7 +1927,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	spin_unlock(&swap_lock);
 	frontswap_invalidate_area(type);
 	mutex_unlock(&swapon_mutex);
+	free_percpu(p->percpu_cluster);
+	p->percpu_cluster = NULL;
 	vfree(swap_map);
+	vfree(cluster_info);
 	vfree(frontswap_map);
 	/* Destroy swap account informatin */
 	swap_cgroup_swapoff(type);
@@ -1926,9 +2174,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 	int i;
 	unsigned long maxpages;
 	unsigned long swapfilepages;
+	unsigned long last_page;
 
 	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
-		printk(KERN_ERR "Unable to find swap-space signature\n");
+		pr_err("Unable to find swap-space signature\n");
 		return 0;
 	}
 
@@ -1942,9 +2191,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 	}
 	/* Check the swap header's sub-version */
 	if (swap_header->info.version != 1) {
-		printk(KERN_WARNING
-		       "Unable to handle swap header version %d\n",
-		       swap_header->info.version);
+		pr_warn("Unable to handle swap header version %d\n",
+			swap_header->info.version);
 		return 0;
 	}
 
@@ -1968,8 +2216,14 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 	 */
 	maxpages = swp_offset(pte_to_swp_entry(
 			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
-	if (maxpages > swap_header->info.last_page) {
-		maxpages = swap_header->info.last_page + 1;
+	last_page = swap_header->info.last_page;
+	if (last_page > maxpages) {
+		pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
+			maxpages << (PAGE_SHIFT - 10),
+			last_page << (PAGE_SHIFT - 10));
+	}
+	if (maxpages > last_page) {
+		maxpages = last_page + 1;
 		/* p->max is an unsigned int: don't overflow it */
 		if ((unsigned int)maxpages == 0)
 			maxpages = UINT_MAX;
@@ -1980,8 +2234,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 		return 0;
 	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
 	if (swapfilepages && maxpages > swapfilepages) {
-		printk(KERN_WARNING
-		       "Swap area shorter than signature indicates\n");
+		pr_warn("Swap area shorter than signature indicates\n");
 		return 0;
 	}
 	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
@@ -1995,15 +2248,23 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 static int setup_swap_map_and_extents(struct swap_info_struct *p,
 					union swap_header *swap_header,
 					unsigned char *swap_map,
+					struct swap_cluster_info *cluster_info,
 					unsigned long maxpages,
 					sector_t *span)
 {
 	int i;
 	unsigned int nr_good_pages;
 	int nr_extents;
+	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
+	unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
 
 	nr_good_pages = maxpages - 1;	/* omit header page */
 
+	cluster_set_null(&p->free_cluster_head);
+	cluster_set_null(&p->free_cluster_tail);
+	cluster_set_null(&p->discard_cluster_head);
+	cluster_set_null(&p->discard_cluster_tail);
+
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
 		unsigned int page_nr = swap_header->info.badpages[i];
 		if (page_nr == 0 || page_nr > swap_header->info.last_page)
@@ -2011,11 +2272,25 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 		if (page_nr < maxpages) {
 			swap_map[page_nr] = SWAP_MAP_BAD;
 			nr_good_pages--;
+			/*
+			 * Haven't marked the cluster free yet, no list
+			 * operation involved
+			 */
+			inc_cluster_info_page(p, cluster_info, page_nr);
 		}
 	}
 
+	/* Haven't marked the cluster free yet, no list operation involved */
+	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
+		inc_cluster_info_page(p, cluster_info, i);
+
 	if (nr_good_pages) {
 		swap_map[0] = SWAP_MAP_BAD;
+		/*
+		 * Not mark the cluster free yet, no list
+		 * operation involved
+		 */
+		inc_cluster_info_page(p, cluster_info, 0);
 		p->max = maxpages;
 		p->pages = nr_good_pages;
 		nr_extents = setup_swap_extents(p, span);
@@ -2024,10 +2299,34 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 		nr_good_pages = p->pages;
 	}
 	if (!nr_good_pages) {
-		printk(KERN_WARNING "Empty swap-file\n");
+		pr_warn("Empty swap-file\n");
 		return -EINVAL;
 	}
 
+	if (!cluster_info)
+		return nr_extents;
+
+	for (i = 0; i < nr_clusters; i++) {
+		if (!cluster_count(&cluster_info[idx])) {
+			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
+			if (cluster_is_null(&p->free_cluster_head)) {
+				cluster_set_next_flag(&p->free_cluster_head,
+								idx, 0);
+				cluster_set_next_flag(&p->free_cluster_tail,
								idx, 0);
+			} else {
+				unsigned int tail;
+
+				tail = cluster_next(&p->free_cluster_tail);
+				cluster_set_next(&cluster_info[tail], idx);
+				cluster_set_next_flag(&p->free_cluster_tail,
+								idx, 0);
+			}
+		}
+		idx++;
+		if (idx == nr_clusters)
+			idx = 0;
+	}
 	return nr_extents;
 }
 
@@ -2059,6 +2358,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	sector_t span;
 	unsigned long maxpages;
 	unsigned char *swap_map = NULL;
+	struct swap_cluster_info *cluster_info = NULL;
 	unsigned long *frontswap_map = NULL;
 	struct page *page = NULL;
 	struct inode *inode = NULL;
@@ -2073,6 +2373,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	if (IS_ERR(p))
 		return PTR_ERR(p);
 
+	INIT_WORK(&p->discard_work, swap_discard_work);
+
 	name = getname(specialfile);
 	if (IS_ERR(name)) {
 		error = PTR_ERR(name);
@@ -2132,13 +2434,38 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		error = -ENOMEM;
 		goto bad_swap;
 	}
+	if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+		p->flags |= SWP_SOLIDSTATE;
+		/*
+		 * select a random position to start with to help wear leveling
+		 * SSD
+		 */
+		p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
+
+		cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
+			SWAPFILE_CLUSTER) * sizeof(*cluster_info));
+		if (!cluster_info) {
+			error = -ENOMEM;
+			goto bad_swap;
+		}
+		p->percpu_cluster = alloc_percpu(struct percpu_cluster);
+		if (!p->percpu_cluster) {
+			error = -ENOMEM;
+			goto bad_swap;
+		}
+		for_each_possible_cpu(i) {
+			struct percpu_cluster *cluster;
+			cluster = per_cpu_ptr(p->percpu_cluster, i);
+			cluster_set_null(&cluster->index);
+		}
+	}
 
 	error = swap_cgroup_swapon(p->type, maxpages);
 	if (error)
 		goto bad_swap;
 
 	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
-		maxpages, &span);
+		cluster_info, maxpages, &span);
 	if (unlikely(nr_extents < 0)) {
 		error = nr_extents;
 		goto bad_swap;
@@ -2147,41 +2474,33 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	if (frontswap_enabled)
 		frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
 
-	if (p->bdev) {
-		if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
-			p->flags |= SWP_SOLIDSTATE;
-			p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
-		}
-
-		if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
-			/*
-			 * When discard is enabled for swap with no particular
-			 * policy flagged, we set all swap discard flags here in
-			 * order to sustain backward compatibility with older
-			 * swapon(8) releases.
-			 */
-			p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
-				     SWP_PAGE_DISCARD);
-
-			/*
-			 * By flagging sys_swapon, a sysadmin can tell us to
-			 * either do single-time area discards only, or to just
-			 * perform discards for released swap page-clusters.
-			 * Now it's time to adjust the p->flags accordingly.
-			 */
-			if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
-				p->flags &= ~SWP_PAGE_DISCARD;
-			else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
-				p->flags &= ~SWP_AREA_DISCARD;
-
-			/* issue a swapon-time discard if it's still required */
-			if (p->flags & SWP_AREA_DISCARD) {
-				int err = discard_swap(p);
-				if (unlikely(err))
-					printk(KERN_ERR
-					       "swapon: discard_swap(%p): %d\n",
-					       p, err);
-			}
-		}
-	}
+	if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+		/*
+		 * When discard is enabled for swap with no particular
+		 * policy flagged, we set all swap discard flags here in
+		 * order to sustain backward compatibility with older
+		 * swapon(8) releases.
+		 */
+		p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
+			     SWP_PAGE_DISCARD);
+
+		/*
+		 * By flagging sys_swapon, a sysadmin can tell us to
+		 * either do single-time area discards only, or to just
+		 * perform discards for released swap page-clusters.
+		 * Now it's time to adjust the p->flags accordingly.
+		 */
+		if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
+			p->flags &= ~SWP_PAGE_DISCARD;
+		else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
+			p->flags &= ~SWP_AREA_DISCARD;
+
+		/* issue a swapon-time discard if it's still required */
+		if (p->flags & SWP_AREA_DISCARD) {
+			int err = discard_swap(p);
+			if (unlikely(err))
+				pr_err("swapon: discard_swap(%p): %d\n",
+					p, err);
+		}
+	}
 
@@ -2190,9 +2509,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	if (swap_flags & SWAP_FLAG_PREFER)
 		prio =
 		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
-	enable_swap_info(p, prio, swap_map, frontswap_map);
+	enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
 
-	printk(KERN_INFO "Adding %uk swap on %s. "
+	pr_info("Adding %uk swap on %s. "
 		"Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
 		p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
 		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
@@ -2211,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	error = 0;
 	goto out;
 bad_swap:
+	free_percpu(p->percpu_cluster);
+	p->percpu_cluster = NULL;
 	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
 		set_blocksize(p->bdev, p->old_block_size);
 		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
@@ -2222,6 +2543,7 @@ bad_swap:
 	p->flags = 0;
 	spin_unlock(&swap_lock);
 	vfree(swap_map);
+	vfree(cluster_info);
 	if (swap_file) {
 		if (inode && S_ISREG(inode->i_mode)) {
 			mutex_unlock(&inode->i_mutex);
@@ -2291,6 +2613,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 		goto unlock_out;
 
 	count = p->swap_map[offset];
+
+	/*
+	 * swapin_readahead() doesn't check if a swap entry is valid, so the
+	 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
+	 */
+	if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
+		err = -ENOENT;
+		goto unlock_out;
+	}
+
 	has_cache = count & SWAP_HAS_CACHE;
 	count &= ~SWAP_HAS_CACHE;
 	err = 0;
@@ -2326,7 +2658,7 @@ out:
 	return err;
 
 bad_file:
-	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
+	pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
 	goto out;
 }
 