-rw-r--r--   include/linux/backing-dev.h        2
-rw-r--r--   include/linux/mmzone.h             8
-rw-r--r--   include/trace/events/writeback.h   7
-rw-r--r--   mm/backing-dev.c                  61
-rw-r--r--   mm/page_alloc.c                    4
-rw-r--r--   mm/vmscan.c                       42
6 files changed, 112 insertions, 12 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 35b00746c712..f1b402a50679 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -285,7 +285,7 @@ enum {
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
 void set_bdi_congested(struct backing_dev_info *bdi, int sync);
 long congestion_wait(int sync, long timeout);
-
+long wait_iff_congested(struct zone *zone, int sync, long timeout);
 
 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c3c17fb675ee..39c24ebe9cfd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -423,6 +423,9 @@ struct zone {
 typedef enum {
         ZONE_RECLAIM_LOCKED,            /* prevents concurrent reclaim */
         ZONE_OOM_LOCKED,                /* zone is in OOM killer zonelist */
+        ZONE_CONGESTED,                 /* zone has many dirty pages backed by
+                                         * a congested BDI
+                                         */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -440,6 +443,11 @@ static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag)
         clear_bit(flag, &zone->flags);
 }
 
+static inline int zone_is_reclaim_congested(const struct zone *zone)
+{
+        return test_bit(ZONE_CONGESTED, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
         return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
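
The new ZONE_CONGESTED bit lives in zone->flags alongside the existing reclaim flags, so it is driven by the same atomic bit helpers (zone_set_flag()/zone_clear_flag()) shown in the surrounding context and can be flipped from direct reclaim and kswapd without extra locking. A minimal usage sketch, with an invented caller name (illustrative only, not part of the patch):

        /* Illustrative only: throttle a reclaimer when the zone recently hit congested BDIs */
        static bool example_should_throttle(struct zone *zone)
        {
                return zone_is_reclaim_congested(zone);
        }
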
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index d2b2654606ec..89a2b2db4375 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -179,6 +179,13 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait,
         TP_ARGS(usec_timeout, usec_delayed)
 );
 
+DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
+
+        TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
+
+        TP_ARGS(usec_timeout, usec_delayed)
+);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 55627306abe0..5ad3c106606b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = {
                 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
                 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
         };
+static atomic_t nr_bdi_congested[2];
 
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
@@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
         wait_queue_head_t *wqh = &congestion_wqh[sync];
 
         bit = sync ? BDI_sync_congested : BDI_async_congested;
-        clear_bit(bit, &bdi->state);
+        if (test_and_clear_bit(bit, &bdi->state))
+                atomic_dec(&nr_bdi_congested[sync]);
         smp_mb__after_clear_bit();
         if (waitqueue_active(wqh))
                 wake_up(wqh);
@@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
         enum bdi_state bit;
 
         bit = sync ? BDI_sync_congested : BDI_async_congested;
-        set_bit(bit, &bdi->state);
+        if (!test_and_set_bit(bit, &bdi->state))
+                atomic_inc(&nr_bdi_congested[sync]);
 }
 EXPORT_SYMBOL(set_bdi_congested);
 
@@ -779,3 +782,57 @@ long congestion_wait(int sync, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * If there is a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+        long ret;
+        unsigned long start = jiffies;
+        DEFINE_WAIT(wait);
+        wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+        /*
+         * If there is no congestion, or heavy congestion is not being
+         * encountered in the current zone, yield if necessary instead
+         * of sleeping on the congestion queue
+         */
+        if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+                        !zone_is_reclaim_congested(zone)) {
+                cond_resched();
+
+                /* In case we scheduled, work out time remaining */
+                ret = timeout - (jiffies - start);
+                if (ret < 0)
+                        ret = 0;
+
+                goto out;
+        }
+
+        /* Sleep until uncongested or a write happens */
+        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+        ret = io_schedule_timeout(timeout);
+        finish_wait(wqh, &wait);
+
+out:
+        trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+                                        jiffies_to_usecs(jiffies - start));
+
+        return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
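
As a usage sketch (not taken from the patch), a path that previously called congestion_wait() unconditionally can now pass the zone it actually cares about; when no BDI is congested, or the zone has not been flagged ZONE_CONGESTED, the call degrades to little more than a cond_resched(). The caller name below is an invented example and the timeout simply mirrors the allocator's existing backoff:

        /* Illustrative caller only; not part of the patch */
        static void example_backoff(struct zone *preferred_zone)
        {
                long remaining;

                remaining = wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);

                /* remaining == timeout means the call returned without sleeping */
                if (remaining == HZ/50)
                        return;

                /* otherwise the task slept on the async congestion wait queue */
        }
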
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6a683f819439..b13bc5e5bd7d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1907,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
                         preferred_zone, migratetype);
 
                 if (!page && gfp_mask & __GFP_NOFAIL)
-                        congestion_wait(BLK_RW_ASYNC, HZ/50);
+                        wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
         } while (!page && (gfp_mask & __GFP_NOFAIL));
 
         return page;
@@ -2095,7 +2095,7 @@ rebalance:
         pages_reclaimed += did_some_progress;
         if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
                 /* Wait for some write requests to complete then retry */
-                congestion_wait(BLK_RW_ASYNC, HZ/50);
+                wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                 goto rebalance;
         }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 130ad0239f52..30fd658bb289 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -401,10 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
         }
         if (mapping->a_ops->writepage == NULL)
                 return PAGE_ACTIVATE;
-        if (!may_write_to_queue(mapping->backing_dev_info, sc)) {
-                disable_lumpy_reclaim_mode(sc);
+        if (!may_write_to_queue(mapping->backing_dev_info, sc))
                 return PAGE_KEEP;
-        }
 
         if (clear_page_dirty_for_io(page)) {
                 int res;
@@ -681,11 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
+                                        struct zone *zone,
                                         struct scan_control *sc)
 {
         LIST_HEAD(ret_pages);
         LIST_HEAD(free_pages);
         int pgactivate = 0;
+        unsigned long nr_dirty = 0;
+        unsigned long nr_congested = 0;
         unsigned long nr_reclaimed = 0;
 
         cond_resched();
@@ -705,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         goto keep;
 
                 VM_BUG_ON(PageActive(page));
+                VM_BUG_ON(page_zone(page) != zone);
 
                 sc->nr_scanned++;
 
@@ -782,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 }
 
                 if (PageDirty(page)) {
+                        nr_dirty++;
+
                         if (references == PAGEREF_RECLAIM_CLEAN)
                                 goto keep_locked;
                         if (!may_enter_fs)
@@ -792,6 +796,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         /* Page is dirty, try to write it out here */
                         switch (pageout(page, mapping, sc)) {
                         case PAGE_KEEP:
+                                nr_congested++;
                                 goto keep_locked;
                         case PAGE_ACTIVATE:
                                 goto activate_locked;
@@ -902,6 +907,15 @@ keep_lumpy:
                 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
         }
 
+        /*
+         * Tag a zone as congested if all the dirty pages encountered were
+         * backed by a congested BDI. In this case, reclaimers should just
+         * back off and wait for congestion to clear because further reclaim
+         * will encounter the same problem
+         */
+        if (nr_dirty == nr_congested)
+                zone_set_flag(zone, ZONE_CONGESTED);
+
         free_page_list(&free_pages);
 
         list_splice(&ret_pages, page_list);
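
A subtlety in the hunk above: nr_congested is only bumped on the PAGE_KEEP path, which with the earlier pageout() change is principally taken when may_write_to_queue() refuses a congested BDI, so the zone is tagged only when every dirty page in the batch was held back by congestion, including the degenerate batch that contained no dirty pages at all (both counters zero). A compressed, illustrative restatement of the rule, with invented names rather than kernel code:

        /* Illustrative only: mirrors the nr_dirty/nr_congested comparison above */
        static bool batch_made_no_writeback_progress(unsigned long dirty_seen,
                                                     unsigned long congested_seen)
        {
                /* also true for a batch that held no dirty pages */
                return dirty_seen == congested_seen;
        }
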
@@ -1386,12 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
         spin_unlock_irq(&zone->lru_lock);
 
-        nr_reclaimed = shrink_page_list(&page_list, sc);
+        nr_reclaimed = shrink_page_list(&page_list, zone, sc);
 
         /* Check if we should synchronously wait for writeback */
         if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
                 set_lumpy_reclaim_mode(priority, sc, true);
-                nr_reclaimed += shrink_page_list(&page_list, sc);
+                nr_reclaimed += shrink_page_list(&page_list, zone, sc);
         }
 
         local_irq_disable();
@@ -1982,8 +1996,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
                 /* Take a nap, wait for some writeback to complete */
                 if (!sc->hibernation_mode && sc->nr_scanned &&
-                    priority < DEF_PRIORITY - 2)
-                        congestion_wait(BLK_RW_ASYNC, HZ/10);
+                    priority < DEF_PRIORITY - 2) {
+                        struct zone *preferred_zone;
+
+                        first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+                                                NULL, &preferred_zone);
+                        wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
+                }
         }
 
 out:
@@ -2282,6 +2301,15 @@ loop_again:
                                 if (!zone_watermark_ok(zone, order,
                                         min_wmark_pages(zone), end_zone, 0))
                                         has_under_min_watermark_zone = 1;
+                        } else {
+                                /*
+                                 * If a zone reaches its high watermark,
+                                 * consider it to be no longer congested. It's
+                                 * possible there are dirty pages backed by
+                                 * congested BDIs but as pressure is relieved,
+                                 * speculatively avoid congestion waits
+                                 */
+                                zone_clear_flag(zone, ZONE_CONGESTED);
                         }
 
                 }
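
Taken together, the changes form a feedback loop across mm/: shrink_page_list() sets ZONE_CONGESTED when an entire batch of dirty pages was stuck behind congested BDIs, wait_iff_congested() consults that flag (plus the global nr_bdi_congested counters) before letting allocators and direct reclaim sleep, and kswapd clears the flag once the zone is back above its high watermark. A condensed, illustrative view of that cycle, using only functions introduced or shown in this diff (the wrapper function itself is hypothetical):

        /* Illustrative lifecycle only; each step really lives in a different function */
        static void example_congestion_cycle(struct zone *zone)
        {
                /* 1. shrink_page_list(): every dirty page hit a congested BDI */
                zone_set_flag(zone, ZONE_CONGESTED);

                /* 2. allocator / direct reclaim: sleep only if it can help */
                wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);

                /* 3. kswapd (balance_pgdat): zone met its high watermark */
                zone_clear_flag(zone, ZONE_CONGESTED);
        }
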
