diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-11 19:08:54 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-11 19:08:54 -0400 |
commit | c2d95729e3094ecdd8c54e856bbe971adbbd7f48 (patch) | |
tree | 76cc5b551227d3d55d68a93105c1fe8080dfb812 /mm/swapfile.c | |
parent | bbda1baeeb2f4aff3addac3d086a1e56c3f2503e (diff) | |
parent | b34081f1cd59585451efaa69e1dff1b9507e6c89 (diff) |
Merge branch 'akpm' (patches from Andrew Morton)
Merge first patch-bomb from Andrew Morton:
- Some pidns/fork/exec tweaks
- OCFS2 updates
- Most of MM - there remain quite a few memcg parts which depend on
pending core cgroups changes. Which might have been already merged -
I'll check tomorrow...
- Various misc stuff all over the place
- A few block bits which I never got around to sending to Jens -
relatively minor things.
- MAINTAINERS maintenance
- A small number of lib/ updates
- checkpatch updates
- epoll
- firmware/dmi-scan
- Some kprobes work for S390
- drivers/rtc updates
- hfsplus feature work
- vmcore feature work
- rbtree upgrades
- AOE updates
- pktcdvd cleanups
- PPS
- memstick
- w1
- New "initmpfs" feature, which does the obvious
- More IPC work from Davidlohr.
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (303 commits)
lz4: fix compression/decompression signedness mismatch
ipc: drop ipc_lock_check
ipc, shm: drop shm_lock_check
ipc: drop ipc_lock_by_ptr
ipc, shm: guard against non-existent vma in shmdt(2)
ipc: document general ipc locking scheme
ipc,msg: drop msg_unlock
ipc: rename ids->rw_mutex
ipc,shm: shorten critical region for shmat
ipc,shm: cleanup do_shmat pasta
ipc,shm: shorten critical region for shmctl
ipc,shm: make shmctl_nolock lockless
ipc,shm: introduce shmctl_nolock
ipc: drop ipcctl_pre_down
ipc,shm: shorten critical region in shmctl_down
ipc,shm: introduce lockless functions to obtain the ipc object
initmpfs: use initramfs if rootfstype= or root= specified
initmpfs: make rootfs use tmpfs when CONFIG_TMPFS enabled
initmpfs: move rootfs code from fs/ramfs/ to init/
initmpfs: move bdi setup from init_rootfs to init_ramfs
...
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r-- | mm/swapfile.c | 596 |
1 files changed, 464 insertions, 132 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c index 6cf2e60983b7..3963fc24fcc1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -175,14 +175,296 @@ static void discard_swap_cluster(struct swap_info_struct *si, | |||
175 | } | 175 | } |
176 | } | 176 | } |
177 | 177 | ||
178 | static int wait_for_discard(void *word) | 178 | #define SWAPFILE_CLUSTER 256 |
179 | #define LATENCY_LIMIT 256 | ||
180 | |||
181 | static inline void cluster_set_flag(struct swap_cluster_info *info, | ||
182 | unsigned int flag) | ||
179 | { | 183 | { |
180 | schedule(); | 184 | info->flags = flag; |
181 | return 0; | ||
182 | } | 185 | } |
183 | 186 | ||
184 | #define SWAPFILE_CLUSTER 256 | 187 | static inline unsigned int cluster_count(struct swap_cluster_info *info) |
185 | #define LATENCY_LIMIT 256 | 188 | { |
189 | return info->data; | ||
190 | } | ||
191 | |||
192 | static inline void cluster_set_count(struct swap_cluster_info *info, | ||
193 | unsigned int c) | ||
194 | { | ||
195 | info->data = c; | ||
196 | } | ||
197 | |||
198 | static inline void cluster_set_count_flag(struct swap_cluster_info *info, | ||
199 | unsigned int c, unsigned int f) | ||
200 | { | ||
201 | info->flags = f; | ||
202 | info->data = c; | ||
203 | } | ||
204 | |||
205 | static inline unsigned int cluster_next(struct swap_cluster_info *info) | ||
206 | { | ||
207 | return info->data; | ||
208 | } | ||
209 | |||
210 | static inline void cluster_set_next(struct swap_cluster_info *info, | ||
211 | unsigned int n) | ||
212 | { | ||
213 | info->data = n; | ||
214 | } | ||
215 | |||
216 | static inline void cluster_set_next_flag(struct swap_cluster_info *info, | ||
217 | unsigned int n, unsigned int f) | ||
218 | { | ||
219 | info->flags = f; | ||
220 | info->data = n; | ||
221 | } | ||
222 | |||
223 | static inline bool cluster_is_free(struct swap_cluster_info *info) | ||
224 | { | ||
225 | return info->flags & CLUSTER_FLAG_FREE; | ||
226 | } | ||
227 | |||
228 | static inline bool cluster_is_null(struct swap_cluster_info *info) | ||
229 | { | ||
230 | return info->flags & CLUSTER_FLAG_NEXT_NULL; | ||
231 | } | ||
232 | |||
233 | static inline void cluster_set_null(struct swap_cluster_info *info) | ||
234 | { | ||
235 | info->flags = CLUSTER_FLAG_NEXT_NULL; | ||
236 | info->data = 0; | ||
237 | } | ||
238 | |||
239 | /* Add a cluster to discard list and schedule it to do discard */ | ||
240 | static void swap_cluster_schedule_discard(struct swap_info_struct *si, | ||
241 | unsigned int idx) | ||
242 | { | ||
243 | /* | ||
244 | * If scan_swap_map() can't find a free cluster, it will check | ||
245 | * si->swap_map directly. To make sure the discarding cluster isn't | ||
246 | * taken by scan_swap_map(), mark the swap entries bad (occupied). It | ||
247 | * will be cleared after discard | ||
248 | */ | ||
249 | memset(si->swap_map + idx * SWAPFILE_CLUSTER, | ||
250 | SWAP_MAP_BAD, SWAPFILE_CLUSTER); | ||
251 | |||
252 | if (cluster_is_null(&si->discard_cluster_head)) { | ||
253 | cluster_set_next_flag(&si->discard_cluster_head, | ||
254 | idx, 0); | ||
255 | cluster_set_next_flag(&si->discard_cluster_tail, | ||
256 | idx, 0); | ||
257 | } else { | ||
258 | unsigned int tail = cluster_next(&si->discard_cluster_tail); | ||
259 | cluster_set_next(&si->cluster_info[tail], idx); | ||
260 | cluster_set_next_flag(&si->discard_cluster_tail, | ||
261 | idx, 0); | ||
262 | } | ||
263 | |||
264 | schedule_work(&si->discard_work); | ||
265 | } | ||
266 | |||
267 | /* | ||
268 | * Doing discard actually. After a cluster discard is finished, the cluster | ||
269 | * will be added to free cluster list. caller should hold si->lock. | ||
270 | */ | ||
271 | static void swap_do_scheduled_discard(struct swap_info_struct *si) | ||
272 | { | ||
273 | struct swap_cluster_info *info; | ||
274 | unsigned int idx; | ||
275 | |||
276 | info = si->cluster_info; | ||
277 | |||
278 | while (!cluster_is_null(&si->discard_cluster_head)) { | ||
279 | idx = cluster_next(&si->discard_cluster_head); | ||
280 | |||
281 | cluster_set_next_flag(&si->discard_cluster_head, | ||
282 | cluster_next(&info[idx]), 0); | ||
283 | if (cluster_next(&si->discard_cluster_tail) == idx) { | ||
284 | cluster_set_null(&si->discard_cluster_head); | ||
285 | cluster_set_null(&si->discard_cluster_tail); | ||
286 | } | ||
287 | spin_unlock(&si->lock); | ||
288 | |||
289 | discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, | ||
290 | SWAPFILE_CLUSTER); | ||
291 | |||
292 | spin_lock(&si->lock); | ||
293 | cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); | ||
294 | if (cluster_is_null(&si->free_cluster_head)) { | ||
295 | cluster_set_next_flag(&si->free_cluster_head, | ||
296 | idx, 0); | ||
297 | cluster_set_next_flag(&si->free_cluster_tail, | ||
298 | idx, 0); | ||
299 | } else { | ||
300 | unsigned int tail; | ||
301 | |||
302 | tail = cluster_next(&si->free_cluster_tail); | ||
303 | cluster_set_next(&info[tail], idx); | ||
304 | cluster_set_next_flag(&si->free_cluster_tail, | ||
305 | idx, 0); | ||
306 | } | ||
307 | memset(si->swap_map + idx * SWAPFILE_CLUSTER, | ||
308 | 0, SWAPFILE_CLUSTER); | ||
309 | } | ||
310 | } | ||
311 | |||
312 | static void swap_discard_work(struct work_struct *work) | ||
313 | { | ||
314 | struct swap_info_struct *si; | ||
315 | |||
316 | si = container_of(work, struct swap_info_struct, discard_work); | ||
317 | |||
318 | spin_lock(&si->lock); | ||
319 | swap_do_scheduled_discard(si); | ||
320 | spin_unlock(&si->lock); | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * The cluster corresponding to page_nr will be used. The cluster will be | ||
325 | * removed from free cluster list and its usage counter will be increased. | ||
326 | */ | ||
327 | static void inc_cluster_info_page(struct swap_info_struct *p, | ||
328 | struct swap_cluster_info *cluster_info, unsigned long page_nr) | ||
329 | { | ||
330 | unsigned long idx = page_nr / SWAPFILE_CLUSTER; | ||
331 | |||
332 | if (!cluster_info) | ||
333 | return; | ||
334 | if (cluster_is_free(&cluster_info[idx])) { | ||
335 | VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx); | ||
336 | cluster_set_next_flag(&p->free_cluster_head, | ||
337 | cluster_next(&cluster_info[idx]), 0); | ||
338 | if (cluster_next(&p->free_cluster_tail) == idx) { | ||
339 | cluster_set_null(&p->free_cluster_tail); | ||
340 | cluster_set_null(&p->free_cluster_head); | ||
341 | } | ||
342 | cluster_set_count_flag(&cluster_info[idx], 0, 0); | ||
343 | } | ||
344 | |||
345 | VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); | ||
346 | cluster_set_count(&cluster_info[idx], | ||
347 | cluster_count(&cluster_info[idx]) + 1); | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * The cluster corresponding to page_nr decreases one usage. If the usage | ||
352 | * counter becomes 0, which means no page in the cluster is in using, we can | ||
353 | * optionally discard the cluster and add it to free cluster list. | ||
354 | */ | ||
355 | static void dec_cluster_info_page(struct swap_info_struct *p, | ||
356 | struct swap_cluster_info *cluster_info, unsigned long page_nr) | ||
357 | { | ||
358 | unsigned long idx = page_nr / SWAPFILE_CLUSTER; | ||
359 | |||
360 | if (!cluster_info) | ||
361 | return; | ||
362 | |||
363 | VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); | ||
364 | cluster_set_count(&cluster_info[idx], | ||
365 | cluster_count(&cluster_info[idx]) - 1); | ||
366 | |||
367 | if (cluster_count(&cluster_info[idx]) == 0) { | ||
368 | /* | ||
369 | * If the swap is discardable, prepare discard the cluster | ||
370 | * instead of free it immediately. The cluster will be freed | ||
371 | * after discard. | ||
372 | */ | ||
373 | if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == | ||
374 | (SWP_WRITEOK | SWP_PAGE_DISCARD)) { | ||
375 | swap_cluster_schedule_discard(p, idx); | ||
376 | return; | ||
377 | } | ||
378 | |||
379 | cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); | ||
380 | if (cluster_is_null(&p->free_cluster_head)) { | ||
381 | cluster_set_next_flag(&p->free_cluster_head, idx, 0); | ||
382 | cluster_set_next_flag(&p->free_cluster_tail, idx, 0); | ||
383 | } else { | ||
384 | unsigned int tail = cluster_next(&p->free_cluster_tail); | ||
385 | cluster_set_next(&cluster_info[tail], idx); | ||
386 | cluster_set_next_flag(&p->free_cluster_tail, idx, 0); | ||
387 | } | ||
388 | } | ||
389 | } | ||
390 | |||
391 | /* | ||
392 | * It's possible scan_swap_map() uses a free cluster in the middle of free | ||
393 | * cluster list. Avoiding such abuse to avoid list corruption. | ||
394 | */ | ||
395 | static bool | ||
396 | scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, | ||
397 | unsigned long offset) | ||
398 | { | ||
399 | struct percpu_cluster *percpu_cluster; | ||
400 | bool conflict; | ||
401 | |||
402 | offset /= SWAPFILE_CLUSTER; | ||
403 | conflict = !cluster_is_null(&si->free_cluster_head) && | ||
404 | offset != cluster_next(&si->free_cluster_head) && | ||
405 | cluster_is_free(&si->cluster_info[offset]); | ||
406 | |||
407 | if (!conflict) | ||
408 | return false; | ||
409 | |||
410 | percpu_cluster = this_cpu_ptr(si->percpu_cluster); | ||
411 | cluster_set_null(&percpu_cluster->index); | ||
412 | return true; | ||
413 | } | ||
414 | |||
415 | /* | ||
416 | * Try to get a swap entry from current cpu's swap entry pool (a cluster). This | ||
417 | * might involve allocating a new cluster for current CPU too. | ||
418 | */ | ||
419 | static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, | ||
420 | unsigned long *offset, unsigned long *scan_base) | ||
421 | { | ||
422 | struct percpu_cluster *cluster; | ||
423 | bool found_free; | ||
424 | unsigned long tmp; | ||
425 | |||
426 | new_cluster: | ||
427 | cluster = this_cpu_ptr(si->percpu_cluster); | ||
428 | if (cluster_is_null(&cluster->index)) { | ||
429 | if (!cluster_is_null(&si->free_cluster_head)) { | ||
430 | cluster->index = si->free_cluster_head; | ||
431 | cluster->next = cluster_next(&cluster->index) * | ||
432 | SWAPFILE_CLUSTER; | ||
433 | } else if (!cluster_is_null(&si->discard_cluster_head)) { | ||
434 | /* | ||
435 | * we don't have free cluster but have some clusters in | ||
436 | * discarding, do discard now and reclaim them | ||
437 | */ | ||
438 | swap_do_scheduled_discard(si); | ||
439 | *scan_base = *offset = si->cluster_next; | ||
440 | goto new_cluster; | ||
441 | } else | ||
442 | return; | ||
443 | } | ||
444 | |||
445 | found_free = false; | ||
446 | |||
447 | /* | ||
448 | * Other CPUs can use our cluster if they can't find a free cluster, | ||
449 | * check if there is still free entry in the cluster | ||
450 | */ | ||
451 | tmp = cluster->next; | ||
452 | while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) * | ||
453 | SWAPFILE_CLUSTER) { | ||
454 | if (!si->swap_map[tmp]) { | ||
455 | found_free = true; | ||
456 | break; | ||
457 | } | ||
458 | tmp++; | ||
459 | } | ||
460 | if (!found_free) { | ||
461 | cluster_set_null(&cluster->index); | ||
462 | goto new_cluster; | ||
463 | } | ||
464 | cluster->next = tmp + 1; | ||
465 | *offset = tmp; | ||
466 | *scan_base = tmp; | ||
467 | } | ||
186 | 468 | ||
187 | static unsigned long scan_swap_map(struct swap_info_struct *si, | 469 | static unsigned long scan_swap_map(struct swap_info_struct *si, |
188 | unsigned char usage) | 470 | unsigned char usage) |
@@ -191,7 +473,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
191 | unsigned long scan_base; | 473 | unsigned long scan_base; |
192 | unsigned long last_in_cluster = 0; | 474 | unsigned long last_in_cluster = 0; |
193 | int latency_ration = LATENCY_LIMIT; | 475 | int latency_ration = LATENCY_LIMIT; |
194 | int found_free_cluster = 0; | ||
195 | 476 | ||
196 | /* | 477 | /* |
197 | * We try to cluster swap pages by allocating them sequentially | 478 | * We try to cluster swap pages by allocating them sequentially |
@@ -207,24 +488,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
207 | si->flags += SWP_SCANNING; | 488 | si->flags += SWP_SCANNING; |
208 | scan_base = offset = si->cluster_next; | 489 | scan_base = offset = si->cluster_next; |
209 | 490 | ||
491 | /* SSD algorithm */ | ||
492 | if (si->cluster_info) { | ||
493 | scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); | ||
494 | goto checks; | ||
495 | } | ||
496 | |||
210 | if (unlikely(!si->cluster_nr--)) { | 497 | if (unlikely(!si->cluster_nr--)) { |
211 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { | 498 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { |
212 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 499 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
213 | goto checks; | 500 | goto checks; |
214 | } | 501 | } |
215 | if (si->flags & SWP_PAGE_DISCARD) { | 502 | |
216 | /* | ||
217 | * Start range check on racing allocations, in case | ||
218 | * they overlap the cluster we eventually decide on | ||
219 | * (we scan without swap_lock to allow preemption). | ||
220 | * It's hardly conceivable that cluster_nr could be | ||
221 | * wrapped during our scan, but don't depend on it. | ||
222 | */ | ||
223 | if (si->lowest_alloc) | ||
224 | goto checks; | ||
225 | si->lowest_alloc = si->max; | ||
226 | si->highest_alloc = 0; | ||
227 | } | ||
228 | spin_unlock(&si->lock); | 503 | spin_unlock(&si->lock); |
229 | 504 | ||
230 | /* | 505 | /* |
@@ -248,7 +523,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
248 | offset -= SWAPFILE_CLUSTER - 1; | 523 | offset -= SWAPFILE_CLUSTER - 1; |
249 | si->cluster_next = offset; | 524 | si->cluster_next = offset; |
250 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 525 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
251 | found_free_cluster = 1; | ||
252 | goto checks; | 526 | goto checks; |
253 | } | 527 | } |
254 | if (unlikely(--latency_ration < 0)) { | 528 | if (unlikely(--latency_ration < 0)) { |
@@ -269,7 +543,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
269 | offset -= SWAPFILE_CLUSTER - 1; | 543 | offset -= SWAPFILE_CLUSTER - 1; |
270 | si->cluster_next = offset; | 544 | si->cluster_next = offset; |
271 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 545 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
272 | found_free_cluster = 1; | ||
273 | goto checks; | 546 | goto checks; |
274 | } | 547 | } |
275 | if (unlikely(--latency_ration < 0)) { | 548 | if (unlikely(--latency_ration < 0)) { |
@@ -281,10 +554,13 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
281 | offset = scan_base; | 554 | offset = scan_base; |
282 | spin_lock(&si->lock); | 555 | spin_lock(&si->lock); |
283 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 556 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
284 | si->lowest_alloc = 0; | ||
285 | } | 557 | } |
286 | 558 | ||
287 | checks: | 559 | checks: |
560 | if (si->cluster_info) { | ||
561 | while (scan_swap_map_ssd_cluster_conflict(si, offset)) | ||
562 | scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); | ||
563 | } | ||
288 | if (!(si->flags & SWP_WRITEOK)) | 564 | if (!(si->flags & SWP_WRITEOK)) |
289 | goto no_page; | 565 | goto no_page; |
290 | if (!si->highest_bit) | 566 | if (!si->highest_bit) |
@@ -317,62 +593,10 @@ checks: | |||
317 | si->highest_bit = 0; | 593 | si->highest_bit = 0; |
318 | } | 594 | } |
319 | si->swap_map[offset] = usage; | 595 | si->swap_map[offset] = usage; |
596 | inc_cluster_info_page(si, si->cluster_info, offset); | ||
320 | si->cluster_next = offset + 1; | 597 | si->cluster_next = offset + 1; |
321 | si->flags -= SWP_SCANNING; | 598 | si->flags -= SWP_SCANNING; |
322 | 599 | ||
323 | if (si->lowest_alloc) { | ||
324 | /* | ||
325 | * Only set when SWP_PAGE_DISCARD, and there's a scan | ||
326 | * for a free cluster in progress or just completed. | ||
327 | */ | ||
328 | if (found_free_cluster) { | ||
329 | /* | ||
330 | * To optimize wear-levelling, discard the | ||
331 | * old data of the cluster, taking care not to | ||
332 | * discard any of its pages that have already | ||
333 | * been allocated by racing tasks (offset has | ||
334 | * already stepped over any at the beginning). | ||
335 | */ | ||
336 | if (offset < si->highest_alloc && | ||
337 | si->lowest_alloc <= last_in_cluster) | ||
338 | last_in_cluster = si->lowest_alloc - 1; | ||
339 | si->flags |= SWP_DISCARDING; | ||
340 | spin_unlock(&si->lock); | ||
341 | |||
342 | if (offset < last_in_cluster) | ||
343 | discard_swap_cluster(si, offset, | ||
344 | last_in_cluster - offset + 1); | ||
345 | |||
346 | spin_lock(&si->lock); | ||
347 | si->lowest_alloc = 0; | ||
348 | si->flags &= ~SWP_DISCARDING; | ||
349 | |||
350 | smp_mb(); /* wake_up_bit advises this */ | ||
351 | wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); | ||
352 | |||
353 | } else if (si->flags & SWP_DISCARDING) { | ||
354 | /* | ||
355 | * Delay using pages allocated by racing tasks | ||
356 | * until the whole discard has been issued. We | ||
357 | * could defer that delay until swap_writepage, | ||
358 | * but it's easier to keep this self-contained. | ||
359 | */ | ||
360 | spin_unlock(&si->lock); | ||
361 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), | ||
362 | wait_for_discard, TASK_UNINTERRUPTIBLE); | ||
363 | spin_lock(&si->lock); | ||
364 | } else { | ||
365 | /* | ||
366 | * Note pages allocated by racing tasks while | ||
367 | * scan for a free cluster is in progress, so | ||
368 | * that its final discard can exclude them. | ||
369 | */ | ||
370 | if (offset < si->lowest_alloc) | ||
371 | si->lowest_alloc = offset; | ||
372 | if (offset > si->highest_alloc) | ||
373 | si->highest_alloc = offset; | ||
374 | } | ||
375 | } | ||
376 | return offset; | 600 | return offset; |
377 | 601 | ||
378 | scan: | 602 | scan: |
@@ -527,16 +751,16 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) | |||
527 | return p; | 751 | return p; |
528 | 752 | ||
529 | bad_free: | 753 | bad_free: |
530 | printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); | 754 | pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val); |
531 | goto out; | 755 | goto out; |
532 | bad_offset: | 756 | bad_offset: |
533 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); | 757 | pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val); |
534 | goto out; | 758 | goto out; |
535 | bad_device: | 759 | bad_device: |
536 | printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); | 760 | pr_err("swap_free: %s%08lx\n", Unused_file, entry.val); |
537 | goto out; | 761 | goto out; |
538 | bad_nofile: | 762 | bad_nofile: |
539 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); | 763 | pr_err("swap_free: %s%08lx\n", Bad_file, entry.val); |
540 | out: | 764 | out: |
541 | return NULL; | 765 | return NULL; |
542 | } | 766 | } |
@@ -600,6 +824,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
600 | 824 | ||
601 | /* free if no reference */ | 825 | /* free if no reference */ |
602 | if (!usage) { | 826 | if (!usage) { |
827 | dec_cluster_info_page(p, p->cluster_info, offset); | ||
603 | if (offset < p->lowest_bit) | 828 | if (offset < p->lowest_bit) |
604 | p->lowest_bit = offset; | 829 | p->lowest_bit = offset; |
605 | if (offset > p->highest_bit) | 830 | if (offset > p->highest_bit) |
@@ -1107,7 +1332,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1107 | else | 1332 | else |
1108 | continue; | 1333 | continue; |
1109 | } | 1334 | } |
1110 | count = si->swap_map[i]; | 1335 | count = ACCESS_ONCE(si->swap_map[i]); |
1111 | if (count && swap_count(count) != SWAP_MAP_BAD) | 1336 | if (count && swap_count(count) != SWAP_MAP_BAD) |
1112 | break; | 1337 | break; |
1113 | } | 1338 | } |
@@ -1127,7 +1352,11 @@ int try_to_unuse(unsigned int type, bool frontswap, | |||
1127 | { | 1352 | { |
1128 | struct swap_info_struct *si = swap_info[type]; | 1353 | struct swap_info_struct *si = swap_info[type]; |
1129 | struct mm_struct *start_mm; | 1354 | struct mm_struct *start_mm; |
1130 | unsigned char *swap_map; | 1355 | volatile unsigned char *swap_map; /* swap_map is accessed without |
1356 | * locking. Mark it as volatile | ||
1357 | * to prevent compiler doing | ||
1358 | * something odd. | ||
1359 | */ | ||
1131 | unsigned char swcount; | 1360 | unsigned char swcount; |
1132 | struct page *page; | 1361 | struct page *page; |
1133 | swp_entry_t entry; | 1362 | swp_entry_t entry; |
@@ -1178,7 +1407,15 @@ int try_to_unuse(unsigned int type, bool frontswap, | |||
1178 | * reused since sys_swapoff() already disabled | 1407 | * reused since sys_swapoff() already disabled |
1179 | * allocation from here, or alloc_page() failed. | 1408 | * allocation from here, or alloc_page() failed. |
1180 | */ | 1409 | */ |
1181 | if (!*swap_map) | 1410 | swcount = *swap_map; |
1411 | /* | ||
1412 | * We don't hold lock here, so the swap entry could be | ||
1413 | * SWAP_MAP_BAD (when the cluster is discarding). | ||
1414 | * Instead of fail out, We can just skip the swap | ||
1415 | * entry because swapoff will wait for discarding | ||
1416 | * finish anyway. | ||
1417 | */ | ||
1418 | if (!swcount || swcount == SWAP_MAP_BAD) | ||
1182 | continue; | 1419 | continue; |
1183 | retval = -ENOMEM; | 1420 | retval = -ENOMEM; |
1184 | break; | 1421 | break; |
@@ -1524,7 +1761,8 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | |||
1524 | } | 1761 | } |
1525 | 1762 | ||
1526 | static void _enable_swap_info(struct swap_info_struct *p, int prio, | 1763 | static void _enable_swap_info(struct swap_info_struct *p, int prio, |
1527 | unsigned char *swap_map) | 1764 | unsigned char *swap_map, |
1765 | struct swap_cluster_info *cluster_info) | ||
1528 | { | 1766 | { |
1529 | int i, prev; | 1767 | int i, prev; |
1530 | 1768 | ||
@@ -1533,6 +1771,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1533 | else | 1771 | else |
1534 | p->prio = --least_priority; | 1772 | p->prio = --least_priority; |
1535 | p->swap_map = swap_map; | 1773 | p->swap_map = swap_map; |
1774 | p->cluster_info = cluster_info; | ||
1536 | p->flags |= SWP_WRITEOK; | 1775 | p->flags |= SWP_WRITEOK; |
1537 | atomic_long_add(p->pages, &nr_swap_pages); | 1776 | atomic_long_add(p->pages, &nr_swap_pages); |
1538 | total_swap_pages += p->pages; | 1777 | total_swap_pages += p->pages; |
@@ -1553,12 +1792,13 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1553 | 1792 | ||
1554 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1793 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
1555 | unsigned char *swap_map, | 1794 | unsigned char *swap_map, |
1795 | struct swap_cluster_info *cluster_info, | ||
1556 | unsigned long *frontswap_map) | 1796 | unsigned long *frontswap_map) |
1557 | { | 1797 | { |
1558 | frontswap_init(p->type, frontswap_map); | 1798 | frontswap_init(p->type, frontswap_map); |
1559 | spin_lock(&swap_lock); | 1799 | spin_lock(&swap_lock); |
1560 | spin_lock(&p->lock); | 1800 | spin_lock(&p->lock); |
1561 | _enable_swap_info(p, prio, swap_map); | 1801 | _enable_swap_info(p, prio, swap_map, cluster_info); |
1562 | spin_unlock(&p->lock); | 1802 | spin_unlock(&p->lock); |
1563 | spin_unlock(&swap_lock); | 1803 | spin_unlock(&swap_lock); |
1564 | } | 1804 | } |
@@ -1567,7 +1807,7 @@ static void reinsert_swap_info(struct swap_info_struct *p) | |||
1567 | { | 1807 | { |
1568 | spin_lock(&swap_lock); | 1808 | spin_lock(&swap_lock); |
1569 | spin_lock(&p->lock); | 1809 | spin_lock(&p->lock); |
1570 | _enable_swap_info(p, p->prio, p->swap_map); | 1810 | _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); |
1571 | spin_unlock(&p->lock); | 1811 | spin_unlock(&p->lock); |
1572 | spin_unlock(&swap_lock); | 1812 | spin_unlock(&swap_lock); |
1573 | } | 1813 | } |
@@ -1576,6 +1816,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1576 | { | 1816 | { |
1577 | struct swap_info_struct *p = NULL; | 1817 | struct swap_info_struct *p = NULL; |
1578 | unsigned char *swap_map; | 1818 | unsigned char *swap_map; |
1819 | struct swap_cluster_info *cluster_info; | ||
1579 | unsigned long *frontswap_map; | 1820 | unsigned long *frontswap_map; |
1580 | struct file *swap_file, *victim; | 1821 | struct file *swap_file, *victim; |
1581 | struct address_space *mapping; | 1822 | struct address_space *mapping; |
@@ -1651,6 +1892,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1651 | goto out_dput; | 1892 | goto out_dput; |
1652 | } | 1893 | } |
1653 | 1894 | ||
1895 | flush_work(&p->discard_work); | ||
1896 | |||
1654 | destroy_swap_extents(p); | 1897 | destroy_swap_extents(p); |
1655 | if (p->flags & SWP_CONTINUED) | 1898 | if (p->flags & SWP_CONTINUED) |
1656 | free_swap_count_continuations(p); | 1899 | free_swap_count_continuations(p); |
@@ -1675,6 +1918,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1675 | p->max = 0; | 1918 | p->max = 0; |
1676 | swap_map = p->swap_map; | 1919 | swap_map = p->swap_map; |
1677 | p->swap_map = NULL; | 1920 | p->swap_map = NULL; |
1921 | cluster_info = p->cluster_info; | ||
1922 | p->cluster_info = NULL; | ||
1678 | p->flags = 0; | 1923 | p->flags = 0; |
1679 | frontswap_map = frontswap_map_get(p); | 1924 | frontswap_map = frontswap_map_get(p); |
1680 | frontswap_map_set(p, NULL); | 1925 | frontswap_map_set(p, NULL); |
@@ -1682,7 +1927,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1682 | spin_unlock(&swap_lock); | 1927 | spin_unlock(&swap_lock); |
1683 | frontswap_invalidate_area(type); | 1928 | frontswap_invalidate_area(type); |
1684 | mutex_unlock(&swapon_mutex); | 1929 | mutex_unlock(&swapon_mutex); |
1930 | free_percpu(p->percpu_cluster); | ||
1931 | p->percpu_cluster = NULL; | ||
1685 | vfree(swap_map); | 1932 | vfree(swap_map); |
1933 | vfree(cluster_info); | ||
1686 | vfree(frontswap_map); | 1934 | vfree(frontswap_map); |
1687 | /* Destroy swap account informatin */ | 1935 | /* Destroy swap account informatin */ |
1688 | swap_cgroup_swapoff(type); | 1936 | swap_cgroup_swapoff(type); |
@@ -1926,9 +2174,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1926 | int i; | 2174 | int i; |
1927 | unsigned long maxpages; | 2175 | unsigned long maxpages; |
1928 | unsigned long swapfilepages; | 2176 | unsigned long swapfilepages; |
2177 | unsigned long last_page; | ||
1929 | 2178 | ||
1930 | if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { | 2179 | if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { |
1931 | printk(KERN_ERR "Unable to find swap-space signature\n"); | 2180 | pr_err("Unable to find swap-space signature\n"); |
1932 | return 0; | 2181 | return 0; |
1933 | } | 2182 | } |
1934 | 2183 | ||
@@ -1942,9 +2191,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1942 | } | 2191 | } |
1943 | /* Check the swap header's sub-version */ | 2192 | /* Check the swap header's sub-version */ |
1944 | if (swap_header->info.version != 1) { | 2193 | if (swap_header->info.version != 1) { |
1945 | printk(KERN_WARNING | 2194 | pr_warn("Unable to handle swap header version %d\n", |
1946 | "Unable to handle swap header version %d\n", | 2195 | swap_header->info.version); |
1947 | swap_header->info.version); | ||
1948 | return 0; | 2196 | return 0; |
1949 | } | 2197 | } |
1950 | 2198 | ||
@@ -1968,8 +2216,14 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1968 | */ | 2216 | */ |
1969 | maxpages = swp_offset(pte_to_swp_entry( | 2217 | maxpages = swp_offset(pte_to_swp_entry( |
1970 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; | 2218 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; |
1971 | if (maxpages > swap_header->info.last_page) { | 2219 | last_page = swap_header->info.last_page; |
1972 | maxpages = swap_header->info.last_page + 1; | 2220 | if (last_page > maxpages) { |
2221 | pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", | ||
2222 | maxpages << (PAGE_SHIFT - 10), | ||
2223 | last_page << (PAGE_SHIFT - 10)); | ||
2224 | } | ||
2225 | if (maxpages > last_page) { | ||
2226 | maxpages = last_page + 1; | ||
1973 | /* p->max is an unsigned int: don't overflow it */ | 2227 | /* p->max is an unsigned int: don't overflow it */ |
1974 | if ((unsigned int)maxpages == 0) | 2228 | if ((unsigned int)maxpages == 0) |
1975 | maxpages = UINT_MAX; | 2229 | maxpages = UINT_MAX; |
@@ -1980,8 +2234,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1980 | return 0; | 2234 | return 0; |
1981 | swapfilepages = i_size_read(inode) >> PAGE_SHIFT; | 2235 | swapfilepages = i_size_read(inode) >> PAGE_SHIFT; |
1982 | if (swapfilepages && maxpages > swapfilepages) { | 2236 | if (swapfilepages && maxpages > swapfilepages) { |
1983 | printk(KERN_WARNING | 2237 | pr_warn("Swap area shorter than signature indicates\n"); |
1984 | "Swap area shorter than signature indicates\n"); | ||
1985 | return 0; | 2238 | return 0; |
1986 | } | 2239 | } |
1987 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) | 2240 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) |
@@ -1995,15 +2248,23 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1995 | static int setup_swap_map_and_extents(struct swap_info_struct *p, | 2248 | static int setup_swap_map_and_extents(struct swap_info_struct *p, |
1996 | union swap_header *swap_header, | 2249 | union swap_header *swap_header, |
1997 | unsigned char *swap_map, | 2250 | unsigned char *swap_map, |
2251 | struct swap_cluster_info *cluster_info, | ||
1998 | unsigned long maxpages, | 2252 | unsigned long maxpages, |
1999 | sector_t *span) | 2253 | sector_t *span) |
2000 | { | 2254 | { |
2001 | int i; | 2255 | int i; |
2002 | unsigned int nr_good_pages; | 2256 | unsigned int nr_good_pages; |
2003 | int nr_extents; | 2257 | int nr_extents; |
2258 | unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); | ||
2259 | unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER; | ||
2004 | 2260 | ||
2005 | nr_good_pages = maxpages - 1; /* omit header page */ | 2261 | nr_good_pages = maxpages - 1; /* omit header page */ |
2006 | 2262 | ||
2263 | cluster_set_null(&p->free_cluster_head); | ||
2264 | cluster_set_null(&p->free_cluster_tail); | ||
2265 | cluster_set_null(&p->discard_cluster_head); | ||
2266 | cluster_set_null(&p->discard_cluster_tail); | ||
2267 | |||
2007 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 2268 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
2008 | unsigned int page_nr = swap_header->info.badpages[i]; | 2269 | unsigned int page_nr = swap_header->info.badpages[i]; |
2009 | if (page_nr == 0 || page_nr > swap_header->info.last_page) | 2270 | if (page_nr == 0 || page_nr > swap_header->info.last_page) |
@@ -2011,11 +2272,25 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, | |||
2011 | if (page_nr < maxpages) { | 2272 | if (page_nr < maxpages) { |
2012 | swap_map[page_nr] = SWAP_MAP_BAD; | 2273 | swap_map[page_nr] = SWAP_MAP_BAD; |
2013 | nr_good_pages--; | 2274 | nr_good_pages--; |
2275 | /* | ||
2276 | * Haven't marked the cluster free yet, no list | ||
2277 | * operation involved | ||
2278 | */ | ||
2279 | inc_cluster_info_page(p, cluster_info, page_nr); | ||
2014 | } | 2280 | } |
2015 | } | 2281 | } |
2016 | 2282 | ||
2283 | /* Haven't marked the cluster free yet, no list operation involved */ | ||
2284 | for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) | ||
2285 | inc_cluster_info_page(p, cluster_info, i); | ||
2286 | |||
2017 | if (nr_good_pages) { | 2287 | if (nr_good_pages) { |
2018 | swap_map[0] = SWAP_MAP_BAD; | 2288 | swap_map[0] = SWAP_MAP_BAD; |
2289 | /* | ||
2290 | * Not mark the cluster free yet, no list | ||
2291 | * operation involved | ||
2292 | */ | ||
2293 | inc_cluster_info_page(p, cluster_info, 0); | ||
2019 | p->max = maxpages; | 2294 | p->max = maxpages; |
2020 | p->pages = nr_good_pages; | 2295 | p->pages = nr_good_pages; |
2021 | nr_extents = setup_swap_extents(p, span); | 2296 | nr_extents = setup_swap_extents(p, span); |
@@ -2024,10 +2299,34 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, | |||
2024 | nr_good_pages = p->pages; | 2299 | nr_good_pages = p->pages; |
2025 | } | 2300 | } |
2026 | if (!nr_good_pages) { | 2301 | if (!nr_good_pages) { |
2027 | printk(KERN_WARNING "Empty swap-file\n"); | 2302 | pr_warn("Empty swap-file\n"); |
2028 | return -EINVAL; | 2303 | return -EINVAL; |
2029 | } | 2304 | } |
2030 | 2305 | ||
2306 | if (!cluster_info) | ||
2307 | return nr_extents; | ||
2308 | |||
2309 | for (i = 0; i < nr_clusters; i++) { | ||
2310 | if (!cluster_count(&cluster_info[idx])) { | ||
2311 | cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); | ||
2312 | if (cluster_is_null(&p->free_cluster_head)) { | ||
2313 | cluster_set_next_flag(&p->free_cluster_head, | ||
2314 | idx, 0); | ||
2315 | cluster_set_next_flag(&p->free_cluster_tail, | ||
2316 | idx, 0); | ||
2317 | } else { | ||
2318 | unsigned int tail; | ||
2319 | |||
2320 | tail = cluster_next(&p->free_cluster_tail); | ||
2321 | cluster_set_next(&cluster_info[tail], idx); | ||
2322 | cluster_set_next_flag(&p->free_cluster_tail, | ||
2323 | idx, 0); | ||
2324 | } | ||
2325 | } | ||
2326 | idx++; | ||
2327 | if (idx == nr_clusters) | ||
2328 | idx = 0; | ||
2329 | } | ||
2031 | return nr_extents; | 2330 | return nr_extents; |
2032 | } | 2331 | } |
2033 | 2332 | ||
@@ -2059,6 +2358,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2059 | sector_t span; | 2358 | sector_t span; |
2060 | unsigned long maxpages; | 2359 | unsigned long maxpages; |
2061 | unsigned char *swap_map = NULL; | 2360 | unsigned char *swap_map = NULL; |
2361 | struct swap_cluster_info *cluster_info = NULL; | ||
2062 | unsigned long *frontswap_map = NULL; | 2362 | unsigned long *frontswap_map = NULL; |
2063 | struct page *page = NULL; | 2363 | struct page *page = NULL; |
2064 | struct inode *inode = NULL; | 2364 | struct inode *inode = NULL; |
@@ -2073,6 +2373,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2073 | if (IS_ERR(p)) | 2373 | if (IS_ERR(p)) |
2074 | return PTR_ERR(p); | 2374 | return PTR_ERR(p); |
2075 | 2375 | ||
2376 | INIT_WORK(&p->discard_work, swap_discard_work); | ||
2377 | |||
2076 | name = getname(specialfile); | 2378 | name = getname(specialfile); |
2077 | if (IS_ERR(name)) { | 2379 | if (IS_ERR(name)) { |
2078 | error = PTR_ERR(name); | 2380 | error = PTR_ERR(name); |
@@ -2132,13 +2434,38 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2132 | error = -ENOMEM; | 2434 | error = -ENOMEM; |
2133 | goto bad_swap; | 2435 | goto bad_swap; |
2134 | } | 2436 | } |
2437 | if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { | ||
2438 | p->flags |= SWP_SOLIDSTATE; | ||
2439 | /* | ||
2440 | * select a random position to start with to help wear leveling | ||
2441 | * SSD | ||
2442 | */ | ||
2443 | p->cluster_next = 1 + (prandom_u32() % p->highest_bit); | ||
2444 | |||
2445 | cluster_info = vzalloc(DIV_ROUND_UP(maxpages, | ||
2446 | SWAPFILE_CLUSTER) * sizeof(*cluster_info)); | ||
2447 | if (!cluster_info) { | ||
2448 | error = -ENOMEM; | ||
2449 | goto bad_swap; | ||
2450 | } | ||
2451 | p->percpu_cluster = alloc_percpu(struct percpu_cluster); | ||
2452 | if (!p->percpu_cluster) { | ||
2453 | error = -ENOMEM; | ||
2454 | goto bad_swap; | ||
2455 | } | ||
2456 | for_each_possible_cpu(i) { | ||
2457 | struct percpu_cluster *cluster; | ||
2458 | cluster = per_cpu_ptr(p->percpu_cluster, i); | ||
2459 | cluster_set_null(&cluster->index); | ||
2460 | } | ||
2461 | } | ||
2135 | 2462 | ||
2136 | error = swap_cgroup_swapon(p->type, maxpages); | 2463 | error = swap_cgroup_swapon(p->type, maxpages); |
2137 | if (error) | 2464 | if (error) |
2138 | goto bad_swap; | 2465 | goto bad_swap; |
2139 | 2466 | ||
2140 | nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, | 2467 | nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, |
2141 | maxpages, &span); | 2468 | cluster_info, maxpages, &span); |
2142 | if (unlikely(nr_extents < 0)) { | 2469 | if (unlikely(nr_extents < 0)) { |
2143 | error = nr_extents; | 2470 | error = nr_extents; |
2144 | goto bad_swap; | 2471 | goto bad_swap; |
@@ -2147,41 +2474,33 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2147 | if (frontswap_enabled) | 2474 | if (frontswap_enabled) |
2148 | frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); | 2475 | frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); |
2149 | 2476 | ||
2150 | if (p->bdev) { | 2477 | if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { |
2151 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 2478 | /* |
2152 | p->flags |= SWP_SOLIDSTATE; | 2479 | * When discard is enabled for swap with no particular |
2153 | p->cluster_next = 1 + (prandom_u32() % p->highest_bit); | 2480 | * policy flagged, we set all swap discard flags here in |
2154 | } | 2481 | * order to sustain backward compatibility with older |
2155 | 2482 | * swapon(8) releases. | |
2156 | if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { | 2483 | */ |
2157 | /* | 2484 | p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | |
2158 | * When discard is enabled for swap with no particular | 2485 | SWP_PAGE_DISCARD); |
2159 | * policy flagged, we set all swap discard flags here in | ||
2160 | * order to sustain backward compatibility with older | ||
2161 | * swapon(8) releases. | ||
2162 | */ | ||
2163 | p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | | ||
2164 | SWP_PAGE_DISCARD); | ||
2165 | 2486 | ||
2166 | /* | 2487 | /* |
2167 | * By flagging sys_swapon, a sysadmin can tell us to | 2488 | * By flagging sys_swapon, a sysadmin can tell us to |
2168 | * either do single-time area discards only, or to just | 2489 | * either do single-time area discards only, or to just |
2169 | * perform discards for released swap page-clusters. | 2490 | * perform discards for released swap page-clusters. |
2170 | * Now it's time to adjust the p->flags accordingly. | 2491 | * Now it's time to adjust the p->flags accordingly. |
2171 | */ | 2492 | */ |
2172 | if (swap_flags & SWAP_FLAG_DISCARD_ONCE) | 2493 | if (swap_flags & SWAP_FLAG_DISCARD_ONCE) |
2173 | p->flags &= ~SWP_PAGE_DISCARD; | 2494 | p->flags &= ~SWP_PAGE_DISCARD; |
2174 | else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) | 2495 | else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) |
2175 | p->flags &= ~SWP_AREA_DISCARD; | 2496 | p->flags &= ~SWP_AREA_DISCARD; |
2176 | 2497 | ||
2177 | /* issue a swapon-time discard if it's still required */ | 2498 | /* issue a swapon-time discard if it's still required */ |
2178 | if (p->flags & SWP_AREA_DISCARD) { | 2499 | if (p->flags & SWP_AREA_DISCARD) { |
2179 | int err = discard_swap(p); | 2500 | int err = discard_swap(p); |
2180 | if (unlikely(err)) | 2501 | if (unlikely(err)) |
2181 | printk(KERN_ERR | 2502 | pr_err("swapon: discard_swap(%p): %d\n", |
2182 | "swapon: discard_swap(%p): %d\n", | 2503 | p, err); |
2183 | p, err); | ||
2184 | } | ||
2185 | } | 2504 | } |
2186 | } | 2505 | } |
2187 | 2506 | ||
@@ -2190,9 +2509,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2190 | if (swap_flags & SWAP_FLAG_PREFER) | 2509 | if (swap_flags & SWAP_FLAG_PREFER) |
2191 | prio = | 2510 | prio = |
2192 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; | 2511 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; |
2193 | enable_swap_info(p, prio, swap_map, frontswap_map); | 2512 | enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); |
2194 | 2513 | ||
2195 | printk(KERN_INFO "Adding %uk swap on %s. " | 2514 | pr_info("Adding %uk swap on %s. " |
2196 | "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", | 2515 | "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", |
2197 | p->pages<<(PAGE_SHIFT-10), name->name, p->prio, | 2516 | p->pages<<(PAGE_SHIFT-10), name->name, p->prio, |
2198 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), | 2517 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
@@ -2211,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2211 | error = 0; | 2530 | error = 0; |
2212 | goto out; | 2531 | goto out; |
2213 | bad_swap: | 2532 | bad_swap: |
2533 | free_percpu(p->percpu_cluster); | ||
2534 | p->percpu_cluster = NULL; | ||
2214 | if (inode && S_ISBLK(inode->i_mode) && p->bdev) { | 2535 | if (inode && S_ISBLK(inode->i_mode) && p->bdev) { |
2215 | set_blocksize(p->bdev, p->old_block_size); | 2536 | set_blocksize(p->bdev, p->old_block_size); |
2216 | blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); | 2537 | blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
@@ -2222,6 +2543,7 @@ bad_swap: | |||
2222 | p->flags = 0; | 2543 | p->flags = 0; |
2223 | spin_unlock(&swap_lock); | 2544 | spin_unlock(&swap_lock); |
2224 | vfree(swap_map); | 2545 | vfree(swap_map); |
2546 | vfree(cluster_info); | ||
2225 | if (swap_file) { | 2547 | if (swap_file) { |
2226 | if (inode && S_ISREG(inode->i_mode)) { | 2548 | if (inode && S_ISREG(inode->i_mode)) { |
2227 | mutex_unlock(&inode->i_mutex); | 2549 | mutex_unlock(&inode->i_mutex); |
@@ -2291,6 +2613,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) | |||
2291 | goto unlock_out; | 2613 | goto unlock_out; |
2292 | 2614 | ||
2293 | count = p->swap_map[offset]; | 2615 | count = p->swap_map[offset]; |
2616 | |||
2617 | /* | ||
2618 | * swapin_readahead() doesn't check if a swap entry is valid, so the | ||
2619 | * swap entry could be SWAP_MAP_BAD. Check here with lock held. | ||
2620 | */ | ||
2621 | if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { | ||
2622 | err = -ENOENT; | ||
2623 | goto unlock_out; | ||
2624 | } | ||
2625 | |||
2294 | has_cache = count & SWAP_HAS_CACHE; | 2626 | has_cache = count & SWAP_HAS_CACHE; |
2295 | count &= ~SWAP_HAS_CACHE; | 2627 | count &= ~SWAP_HAS_CACHE; |
2296 | err = 0; | 2628 | err = 0; |
@@ -2326,7 +2658,7 @@ out: | |||
2326 | return err; | 2658 | return err; |
2327 | 2659 | ||
2328 | bad_file: | 2660 | bad_file: |
2329 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | 2661 | pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); |
2330 | goto out; | 2662 | goto out; |
2331 | } | 2663 | } |
2332 | 2664 | ||