aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-05-08 13:13:35 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-05-08 13:13:35 -0400
commit4de13d7aa8f4d02f4dc99d4609575659f92b3c5a (patch)
tree3bc9729eabe79c6164cd29a5d605000bc82bf837 /mm
parent5af43c24ca59a448c9312dd4a4a51d27ec3b9a73 (diff)
parentb8d4a5bf6a049303a29a3275f463f09a490b50ea (diff)
Merge branch 'for-3.10/core' of git://git.kernel.dk/linux-block
Pull block core updates from Jens Axboe: - Major bit is Kent's prep work for immutable bio vecs. - Stable candidate fix for a scheduling-while-atomic in the queue bypass operation. - Fix for the hang on exceeded rq->datalen 32-bit unsigned when merging discard bios. - Tejun's changes to convert the writeback thread pool to the generic workqueue mechanism. - Runtime PM framework, SCSI patches exist on top of these in James' tree. - A few random fixes. * 'for-3.10/core' of git://git.kernel.dk/linux-block: (40 commits) relay: move remove_buf_file inside relay_close_buf partitions/efi.c: replace useless kzalloc's by kmalloc's fs/block_dev.c: fix iov_shorten() criteria in blkdev_aio_read() block: fix max discard sectors limit blkcg: fix "scheduling while atomic" in blk_queue_bypass_start Documentation: cfq-iosched: update documentation help for cfq tunables writeback: expose the bdi_wq workqueue writeback: replace custom worker pool implementation with unbound workqueue writeback: remove unused bdi_pending_list aoe: Fix unitialized var usage bio-integrity: Add explicit field for owner of bip_buf block: Add an explicit bio flag for bios that own their bvec block: Add bio_alloc_pages() block: Convert some code to bio_for_each_segment_all() block: Add bio_for_each_segment_all() bounce: Refactor __blk_queue_bounce to not use bi_io_vec raid1: use bio_copy_data() pktcdvd: Use bio_reset() in disabled code to kill bi_idx usage pktcdvd: use bio_copy_data() block: Add bio_copy_data() ...
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c259
-rw-r--r--mm/bounce.c75
-rw-r--r--mm/page_io.c1
3 files changed, 49 insertions, 286 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 41733c5dc820..502517492258 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -31,13 +31,14 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info);
31static struct class *bdi_class; 31static struct class *bdi_class;
32 32
33/* 33/*
34 * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as 34 * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
35 * reader side protection for bdi_pending_list. bdi_list has RCU reader side
36 * locking. 35 * locking.
37 */ 36 */
38DEFINE_SPINLOCK(bdi_lock); 37DEFINE_SPINLOCK(bdi_lock);
39LIST_HEAD(bdi_list); 38LIST_HEAD(bdi_list);
40LIST_HEAD(bdi_pending_list); 39
40/* bdi_wq serves all asynchronous writeback tasks */
41struct workqueue_struct *bdi_wq;
41 42
42void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) 43void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
43{ 44{
@@ -257,6 +258,11 @@ static int __init default_bdi_init(void)
257{ 258{
258 int err; 259 int err;
259 260
261 bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
262 WQ_UNBOUND | WQ_SYSFS, 0);
263 if (!bdi_wq)
264 return -ENOMEM;
265
260 err = bdi_init(&default_backing_dev_info); 266 err = bdi_init(&default_backing_dev_info);
261 if (!err) 267 if (!err)
262 bdi_register(&default_backing_dev_info, NULL, "default"); 268 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -271,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
271 return wb_has_dirty_io(&bdi->wb); 277 return wb_has_dirty_io(&bdi->wb);
272} 278}
273 279
274static void wakeup_timer_fn(unsigned long data)
275{
276 struct backing_dev_info *bdi = (struct backing_dev_info *)data;
277
278 spin_lock_bh(&bdi->wb_lock);
279 if (bdi->wb.task) {
280 trace_writeback_wake_thread(bdi);
281 wake_up_process(bdi->wb.task);
282 } else if (bdi->dev) {
283 /*
284 * When bdi tasks are inactive for long time, they are killed.
285 * In this case we have to wake-up the forker thread which
286 * should create and run the bdi thread.
287 */
288 trace_writeback_wake_forker_thread(bdi);
289 wake_up_process(default_backing_dev_info.wb.task);
290 }
291 spin_unlock_bh(&bdi->wb_lock);
292}
293
294/* 280/*
295 * This function is used when the first inode for this bdi is marked dirty. It 281 * This function is used when the first inode for this bdi is marked dirty. It
296 * wakes-up the corresponding bdi thread which should then take care of the 282 * wakes-up the corresponding bdi thread which should then take care of the
@@ -307,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
307 unsigned long timeout; 293 unsigned long timeout;
308 294
309 timeout = msecs_to_jiffies(dirty_writeback_interval * 10); 295 timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
310 mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); 296 mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
311}
312
313/*
314 * Calculate the longest interval (jiffies) bdi threads are allowed to be
315 * inactive.
316 */
317static unsigned long bdi_longest_inactive(void)
318{
319 unsigned long interval;
320
321 interval = msecs_to_jiffies(dirty_writeback_interval * 10);
322 return max(5UL * 60 * HZ, interval);
323}
324
325/*
326 * Clear pending bit and wakeup anybody waiting for flusher thread creation or
327 * shutdown
328 */
329static void bdi_clear_pending(struct backing_dev_info *bdi)
330{
331 clear_bit(BDI_pending, &bdi->state);
332 smp_mb__after_clear_bit();
333 wake_up_bit(&bdi->state, BDI_pending);
334}
335
336static int bdi_forker_thread(void *ptr)
337{
338 struct bdi_writeback *me = ptr;
339
340 current->flags |= PF_SWAPWRITE;
341 set_freezable();
342
343 /*
344 * Our parent may run at a different priority, just set us to normal
345 */
346 set_user_nice(current, 0);
347
348 for (;;) {
349 struct task_struct *task = NULL;
350 struct backing_dev_info *bdi;
351 enum {
352 NO_ACTION, /* Nothing to do */
353 FORK_THREAD, /* Fork bdi thread */
354 KILL_THREAD, /* Kill inactive bdi thread */
355 } action = NO_ACTION;
356
357 /*
358 * Temporary measure, we want to make sure we don't see
359 * dirty data on the default backing_dev_info
360 */
361 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
362 del_timer(&me->wakeup_timer);
363 wb_do_writeback(me, 0);
364 }
365
366 spin_lock_bh(&bdi_lock);
367 /*
368 * In the following loop we are going to check whether we have
369 * some work to do without any synchronization with tasks
370 * waking us up to do work for them. Set the task state here
371 * so that we don't miss wakeups after verifying conditions.
372 */
373 set_current_state(TASK_INTERRUPTIBLE);
374
375 list_for_each_entry(bdi, &bdi_list, bdi_list) {
376 bool have_dirty_io;
377
378 if (!bdi_cap_writeback_dirty(bdi) ||
379 bdi_cap_flush_forker(bdi))
380 continue;
381
382 WARN(!test_bit(BDI_registered, &bdi->state),
383 "bdi %p/%s is not registered!\n", bdi, bdi->name);
384
385 have_dirty_io = !list_empty(&bdi->work_list) ||
386 wb_has_dirty_io(&bdi->wb);
387
388 /*
389 * If the bdi has work to do, but the thread does not
390 * exist - create it.
391 */
392 if (!bdi->wb.task && have_dirty_io) {
393 /*
394 * Set the pending bit - if someone will try to
395 * unregister this bdi - it'll wait on this bit.
396 */
397 set_bit(BDI_pending, &bdi->state);
398 action = FORK_THREAD;
399 break;
400 }
401
402 spin_lock(&bdi->wb_lock);
403
404 /*
405 * If there is no work to do and the bdi thread was
406 * inactive long enough - kill it. The wb_lock is taken
407 * to make sure no-one adds more work to this bdi and
408 * wakes the bdi thread up.
409 */
410 if (bdi->wb.task && !have_dirty_io &&
411 time_after(jiffies, bdi->wb.last_active +
412 bdi_longest_inactive())) {
413 task = bdi->wb.task;
414 bdi->wb.task = NULL;
415 spin_unlock(&bdi->wb_lock);
416 set_bit(BDI_pending, &bdi->state);
417 action = KILL_THREAD;
418 break;
419 }
420 spin_unlock(&bdi->wb_lock);
421 }
422 spin_unlock_bh(&bdi_lock);
423
424 /* Keep working if default bdi still has things to do */
425 if (!list_empty(&me->bdi->work_list))
426 __set_current_state(TASK_RUNNING);
427
428 switch (action) {
429 case FORK_THREAD:
430 __set_current_state(TASK_RUNNING);
431 task = kthread_create(bdi_writeback_thread, &bdi->wb,
432 "flush-%s", dev_name(bdi->dev));
433 if (IS_ERR(task)) {
434 /*
435 * If thread creation fails, force writeout of
436 * the bdi from the thread. Hopefully 1024 is
437 * large enough for efficient IO.
438 */
439 writeback_inodes_wb(&bdi->wb, 1024,
440 WB_REASON_FORKER_THREAD);
441 } else {
442 /*
443 * The spinlock makes sure we do not lose
444 * wake-ups when racing with 'bdi_queue_work()'.
445 * And as soon as the bdi thread is visible, we
446 * can start it.
447 */
448 spin_lock_bh(&bdi->wb_lock);
449 bdi->wb.task = task;
450 spin_unlock_bh(&bdi->wb_lock);
451 wake_up_process(task);
452 }
453 bdi_clear_pending(bdi);
454 break;
455
456 case KILL_THREAD:
457 __set_current_state(TASK_RUNNING);
458 kthread_stop(task);
459 bdi_clear_pending(bdi);
460 break;
461
462 case NO_ACTION:
463 if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
464 /*
465 * There are no dirty data. The only thing we
466 * should now care about is checking for
467 * inactive bdi threads and killing them. Thus,
468 * let's sleep for longer time, save energy and
469 * be friendly for battery-driven devices.
470 */
471 schedule_timeout(bdi_longest_inactive());
472 else
473 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
474 try_to_freeze();
475 break;
476 }
477 }
478
479 return 0;
480} 297}
481 298
482/* 299/*
@@ -489,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
489 spin_unlock_bh(&bdi_lock); 306 spin_unlock_bh(&bdi_lock);
490 307
491 synchronize_rcu_expedited(); 308 synchronize_rcu_expedited();
309
310 /* bdi_list is now unused, clear it to mark @bdi dying */
311 INIT_LIST_HEAD(&bdi->bdi_list);
492} 312}
493 313
494int bdi_register(struct backing_dev_info *bdi, struct device *parent, 314int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -508,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
508 328
509 bdi->dev = dev; 329 bdi->dev = dev;
510 330
511 /*
512 * Just start the forker thread for our default backing_dev_info,
513 * and add other bdi's to the list. They will get a thread created
514 * on-demand when they need it.
515 */
516 if (bdi_cap_flush_forker(bdi)) {
517 struct bdi_writeback *wb = &bdi->wb;
518
519 wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
520 dev_name(dev));
521 if (IS_ERR(wb->task))
522 return PTR_ERR(wb->task);
523 }
524
525 bdi_debug_register(bdi, dev_name(dev)); 331 bdi_debug_register(bdi, dev_name(dev));
526 set_bit(BDI_registered, &bdi->state); 332 set_bit(BDI_registered, &bdi->state);
527 333
@@ -545,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev);
545 */ 351 */
546static void bdi_wb_shutdown(struct backing_dev_info *bdi) 352static void bdi_wb_shutdown(struct backing_dev_info *bdi)
547{ 353{
548 struct task_struct *task;
549
550 if (!bdi_cap_writeback_dirty(bdi)) 354 if (!bdi_cap_writeback_dirty(bdi))
551 return; 355 return;
552 356
@@ -556,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
556 bdi_remove_from_list(bdi); 360 bdi_remove_from_list(bdi);
557 361
558 /* 362 /*
559 * If setup is pending, wait for that to complete first 363 * Drain work list and shutdown the delayed_work. At this point,
364 * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi
365 * is dying and its work_list needs to be drained no matter what.
560 */ 366 */
561 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, 367 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
562 TASK_UNINTERRUPTIBLE); 368 flush_delayed_work(&bdi->wb.dwork);
369 WARN_ON(!list_empty(&bdi->work_list));
563 370
564 /* 371 /*
565 * Finally, kill the kernel thread. We don't need to be RCU 372 * This shouldn't be necessary unless @bdi for some reason has
566 * safe anymore, since the bdi is gone from visibility. 373 * unflushed dirty IO after work_list is drained. Do it anyway
374 * just in case.
567 */ 375 */
568 spin_lock_bh(&bdi->wb_lock); 376 cancel_delayed_work_sync(&bdi->wb.dwork);
569 task = bdi->wb.task;
570 bdi->wb.task = NULL;
571 spin_unlock_bh(&bdi->wb_lock);
572
573 if (task)
574 kthread_stop(task);
575} 377}
576 378
577/* 379/*
@@ -597,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi)
597 bdi_set_min_ratio(bdi, 0); 399 bdi_set_min_ratio(bdi, 0);
598 trace_writeback_bdi_unregister(bdi); 400 trace_writeback_bdi_unregister(bdi);
599 bdi_prune_sb(bdi); 401 bdi_prune_sb(bdi);
600 del_timer_sync(&bdi->wb.wakeup_timer);
601 402
602 if (!bdi_cap_flush_forker(bdi)) 403 bdi_wb_shutdown(bdi);
603 bdi_wb_shutdown(bdi);
604 bdi_debug_unregister(bdi); 404 bdi_debug_unregister(bdi);
605 405
606 spin_lock_bh(&bdi->wb_lock); 406 spin_lock_bh(&bdi->wb_lock);
@@ -622,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
622 INIT_LIST_HEAD(&wb->b_io); 422 INIT_LIST_HEAD(&wb->b_io);
623 INIT_LIST_HEAD(&wb->b_more_io); 423 INIT_LIST_HEAD(&wb->b_more_io);
624 spin_lock_init(&wb->list_lock); 424 spin_lock_init(&wb->list_lock);
625 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); 425 INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
626} 426}
627 427
628/* 428/*
@@ -695,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
695 bdi_unregister(bdi); 495 bdi_unregister(bdi);
696 496
697 /* 497 /*
698 * If bdi_unregister() had already been called earlier, the 498 * If bdi_unregister() had already been called earlier, the dwork
699 * wakeup_timer could still be armed because bdi_prune_sb() 499 * could still be pending because bdi_prune_sb() can race with the
700 * can race with the bdi_wakeup_thread_delayed() calls from 500 * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
701 * __mark_inode_dirty().
702 */ 501 */
703 del_timer_sync(&bdi->wb.wakeup_timer); 502 cancel_delayed_work_sync(&bdi->wb.dwork);
704 503
705 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 504 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
706 percpu_counter_destroy(&bdi->bdi_stat[i]); 505 percpu_counter_destroy(&bdi->bdi_stat[i]);
diff --git a/mm/bounce.c b/mm/bounce.c
index a5c2ec3589cb..c9f0a4339a7d 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -101,7 +101,7 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
101 struct bio_vec *tovec, *fromvec; 101 struct bio_vec *tovec, *fromvec;
102 int i; 102 int i;
103 103
104 __bio_for_each_segment(tovec, to, i, 0) { 104 bio_for_each_segment(tovec, to, i) {
105 fromvec = from->bi_io_vec + i; 105 fromvec = from->bi_io_vec + i;
106 106
107 /* 107 /*
@@ -134,7 +134,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
134 /* 134 /*
135 * free up bounce indirect pages used 135 * free up bounce indirect pages used
136 */ 136 */
137 __bio_for_each_segment(bvec, bio, i, 0) { 137 bio_for_each_segment_all(bvec, bio, i) {
138 org_vec = bio_orig->bi_io_vec + i; 138 org_vec = bio_orig->bi_io_vec + i;
139 if (bvec->bv_page == org_vec->bv_page) 139 if (bvec->bv_page == org_vec->bv_page)
140 continue; 140 continue;
@@ -199,78 +199,43 @@ static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
199static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, 199static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
200 mempool_t *pool, int force) 200 mempool_t *pool, int force)
201{ 201{
202 struct page *page; 202 struct bio *bio;
203 struct bio *bio = NULL; 203 int rw = bio_data_dir(*bio_orig);
204 int i, rw = bio_data_dir(*bio_orig);
205 struct bio_vec *to, *from; 204 struct bio_vec *to, *from;
205 unsigned i;
206 206
207 bio_for_each_segment(from, *bio_orig, i) { 207 bio_for_each_segment(from, *bio_orig, i)
208 page = from->bv_page; 208 if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
209 goto bounce;
209 210
210 /* 211 return;
211 * is destination page below bounce pfn? 212bounce:
212 */ 213 bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set);
213 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
214 continue;
215
216 /*
217 * irk, bounce it
218 */
219 if (!bio) {
220 unsigned int cnt = (*bio_orig)->bi_vcnt;
221 214
222 bio = bio_alloc(GFP_NOIO, cnt); 215 bio_for_each_segment_all(to, bio, i) {
223 memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec)); 216 struct page *page = to->bv_page;
224 }
225
226 217
227 to = bio->bi_io_vec + i; 218 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
219 continue;
228 220
229 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
230 to->bv_len = from->bv_len;
231 to->bv_offset = from->bv_offset;
232 inc_zone_page_state(to->bv_page, NR_BOUNCE); 221 inc_zone_page_state(to->bv_page, NR_BOUNCE);
222 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
233 223
234 if (rw == WRITE) { 224 if (rw == WRITE) {
235 char *vto, *vfrom; 225 char *vto, *vfrom;
236 226
237 flush_dcache_page(from->bv_page); 227 flush_dcache_page(page);
228
238 vto = page_address(to->bv_page) + to->bv_offset; 229 vto = page_address(to->bv_page) + to->bv_offset;
239 vfrom = kmap(from->bv_page) + from->bv_offset; 230 vfrom = kmap_atomic(page) + to->bv_offset;
240 memcpy(vto, vfrom, to->bv_len); 231 memcpy(vto, vfrom, to->bv_len);
241 kunmap(from->bv_page); 232 kunmap_atomic(vfrom);
242 } 233 }
243 } 234 }
244 235
245 /*
246 * no pages bounced
247 */
248 if (!bio)
249 return;
250
251 trace_block_bio_bounce(q, *bio_orig); 236 trace_block_bio_bounce(q, *bio_orig);
252 237
253 /*
254 * at least one page was bounced, fill in possible non-highmem
255 * pages
256 */
257 __bio_for_each_segment(from, *bio_orig, i, 0) {
258 to = bio_iovec_idx(bio, i);
259 if (!to->bv_page) {
260 to->bv_page = from->bv_page;
261 to->bv_len = from->bv_len;
262 to->bv_offset = from->bv_offset;
263 }
264 }
265
266 bio->bi_bdev = (*bio_orig)->bi_bdev;
267 bio->bi_flags |= (1 << BIO_BOUNCED); 238 bio->bi_flags |= (1 << BIO_BOUNCED);
268 bio->bi_sector = (*bio_orig)->bi_sector;
269 bio->bi_rw = (*bio_orig)->bi_rw;
270
271 bio->bi_vcnt = (*bio_orig)->bi_vcnt;
272 bio->bi_idx = (*bio_orig)->bi_idx;
273 bio->bi_size = (*bio_orig)->bi_size;
274 239
275 if (pool == page_pool) { 240 if (pool == page_pool) {
276 bio->bi_end_io = bounce_end_io_write; 241 bio->bi_end_io = bounce_end_io_write;
diff --git a/mm/page_io.c b/mm/page_io.c
index 06a8842a6ec6..a8a3ef45fed7 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -36,7 +36,6 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
36 bio->bi_io_vec[0].bv_len = PAGE_SIZE; 36 bio->bi_io_vec[0].bv_len = PAGE_SIZE;
37 bio->bi_io_vec[0].bv_offset = 0; 37 bio->bi_io_vec[0].bv_offset = 0;
38 bio->bi_vcnt = 1; 38 bio->bi_vcnt = 1;
39 bio->bi_idx = 0;
40 bio->bi_size = PAGE_SIZE; 39 bio->bi_size = PAGE_SIZE;
41 bio->bi_end_io = end_io; 40 bio->bi_end_io = end_io;
42 } 41 }