diff options
| -rw-r--r-- | include/trace/events/writeback.h | 24 | ||||
| -rw-r--r-- | mm/page-writeback.c | 161 |
2 files changed, 56 insertions, 129 deletions
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 5f172703eb4f..178c23508d3d 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
| @@ -104,30 +104,6 @@ DEFINE_WRITEBACK_EVENT(writeback_bdi_register); | |||
| 104 | DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); | 104 | DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); |
| 105 | DEFINE_WRITEBACK_EVENT(writeback_thread_start); | 105 | DEFINE_WRITEBACK_EVENT(writeback_thread_start); |
| 106 | DEFINE_WRITEBACK_EVENT(writeback_thread_stop); | 106 | DEFINE_WRITEBACK_EVENT(writeback_thread_stop); |
| 107 | DEFINE_WRITEBACK_EVENT(balance_dirty_start); | ||
| 108 | DEFINE_WRITEBACK_EVENT(balance_dirty_wait); | ||
| 109 | |||
| 110 | TRACE_EVENT(balance_dirty_written, | ||
| 111 | |||
| 112 | TP_PROTO(struct backing_dev_info *bdi, int written), | ||
| 113 | |||
| 114 | TP_ARGS(bdi, written), | ||
| 115 | |||
| 116 | TP_STRUCT__entry( | ||
| 117 | __array(char, name, 32) | ||
| 118 | __field(int, written) | ||
| 119 | ), | ||
| 120 | |||
| 121 | TP_fast_assign( | ||
| 122 | strncpy(__entry->name, dev_name(bdi->dev), 32); | ||
| 123 | __entry->written = written; | ||
| 124 | ), | ||
| 125 | |||
| 126 | TP_printk("bdi %s written %d", | ||
| 127 | __entry->name, | ||
| 128 | __entry->written | ||
| 129 | ) | ||
| 130 | ); | ||
| 131 | 107 | ||
| 132 | DECLARE_EVENT_CLASS(wbc_class, | 108 | DECLARE_EVENT_CLASS(wbc_class, |
| 133 | TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), | 109 | TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index daff320d263f..f32f25092c66 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
| @@ -250,50 +250,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, | |||
| 250 | numerator, denominator); | 250 | numerator, denominator); |
| 251 | } | 251 | } |
| 252 | 252 | ||
| 253 | static inline void task_dirties_fraction(struct task_struct *tsk, | ||
| 254 | long *numerator, long *denominator) | ||
| 255 | { | ||
| 256 | prop_fraction_single(&vm_dirties, &tsk->dirties, | ||
| 257 | numerator, denominator); | ||
| 258 | } | ||
| 259 | |||
| 260 | /* | ||
| 261 | * task_dirty_limit - scale down dirty throttling threshold for one task | ||
| 262 | * | ||
| 263 | * task specific dirty limit: | ||
| 264 | * | ||
| 265 | * dirty -= (dirty/8) * p_{t} | ||
| 266 | * | ||
| 267 | * To protect light/slow dirtying tasks from heavier/fast ones, we start | ||
| 268 | * throttling individual tasks before reaching the bdi dirty limit. | ||
| 269 | * Relatively low thresholds will be allocated to heavy dirtiers. So when | ||
| 270 | * dirty pages grow large, heavy dirtiers will be throttled first, which will | ||
| 271 | * effectively curb the growth of dirty pages. Light dirtiers with high enough | ||
| 272 | * dirty threshold may never get throttled. | ||
| 273 | */ | ||
| 274 | #define TASK_LIMIT_FRACTION 8 | ||
| 275 | static unsigned long task_dirty_limit(struct task_struct *tsk, | ||
| 276 | unsigned long bdi_dirty) | ||
| 277 | { | ||
| 278 | long numerator, denominator; | ||
| 279 | unsigned long dirty = bdi_dirty; | ||
| 280 | u64 inv = dirty / TASK_LIMIT_FRACTION; | ||
| 281 | |||
| 282 | task_dirties_fraction(tsk, &numerator, &denominator); | ||
| 283 | inv *= numerator; | ||
| 284 | do_div(inv, denominator); | ||
| 285 | |||
| 286 | dirty -= inv; | ||
| 287 | |||
| 288 | return max(dirty, bdi_dirty/2); | ||
| 289 | } | ||
| 290 | |||
| 291 | /* Minimum limit for any task */ | ||
| 292 | static unsigned long task_min_dirty_limit(unsigned long bdi_dirty) | ||
| 293 | { | ||
| 294 | return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION; | ||
| 295 | } | ||
| 296 | |||
| 297 | /* | 253 | /* |
| 298 | * | 254 | * |
| 299 | */ | 255 | */ |
| @@ -986,30 +942,36 @@ static unsigned long dirty_poll_interval(unsigned long dirty, | |||
| 986 | /* | 942 | /* |
| 987 | * balance_dirty_pages() must be called by processes which are generating dirty | 943 | * balance_dirty_pages() must be called by processes which are generating dirty |
| 988 | * data. It looks at the number of dirty pages in the machine and will force | 944 | * data. It looks at the number of dirty pages in the machine and will force |
| 989 | * the caller to perform writeback if the system is over `vm_dirty_ratio'. | 945 | * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. |
| 990 | * If we're over `background_thresh' then the writeback threads are woken to | 946 | * If we're over `background_thresh' then the writeback threads are woken to |
| 991 | * perform some writeout. | 947 | * perform some writeout. |
| 992 | */ | 948 | */ |
| 993 | static void balance_dirty_pages(struct address_space *mapping, | 949 | static void balance_dirty_pages(struct address_space *mapping, |
| 994 | unsigned long write_chunk) | 950 | unsigned long pages_dirtied) |
| 995 | { | 951 | { |
| 996 | unsigned long nr_reclaimable, bdi_nr_reclaimable; | 952 | unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ |
| 953 | unsigned long bdi_reclaimable; | ||
| 997 | unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ | 954 | unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ |
| 998 | unsigned long bdi_dirty; | 955 | unsigned long bdi_dirty; |
| 999 | unsigned long freerun; | 956 | unsigned long freerun; |
| 1000 | unsigned long background_thresh; | 957 | unsigned long background_thresh; |
| 1001 | unsigned long dirty_thresh; | 958 | unsigned long dirty_thresh; |
| 1002 | unsigned long bdi_thresh; | 959 | unsigned long bdi_thresh; |
| 1003 | unsigned long task_bdi_thresh; | 960 | long pause = 0; |
| 1004 | unsigned long min_task_bdi_thresh; | ||
| 1005 | unsigned long pages_written = 0; | ||
| 1006 | unsigned long pause = 1; | ||
| 1007 | bool dirty_exceeded = false; | 961 | bool dirty_exceeded = false; |
| 1008 | bool clear_dirty_exceeded = true; | 962 | unsigned long task_ratelimit; |
| 963 | unsigned long dirty_ratelimit; | ||
| 964 | unsigned long pos_ratio; | ||
| 1009 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 965 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
| 1010 | unsigned long start_time = jiffies; | 966 | unsigned long start_time = jiffies; |
| 1011 | 967 | ||
| 1012 | for (;;) { | 968 | for (;;) { |
| 969 | /* | ||
| 970 | * Unstable writes are a feature of certain networked | ||
| 971 | * filesystems (i.e. NFS) in which data may have been | ||
| 972 | * written to the server's write cache, but has not yet | ||
| 973 | * been flushed to permanent storage. | ||
| 974 | */ | ||
| 1013 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 975 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + |
| 1014 | global_page_state(NR_UNSTABLE_NFS); | 976 | global_page_state(NR_UNSTABLE_NFS); |
| 1015 | nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); | 977 | nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); |
| @@ -1026,9 +988,23 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
| 1026 | if (nr_dirty <= freerun) | 988 | if (nr_dirty <= freerun) |
| 1027 | break; | 989 | break; |
| 1028 | 990 | ||
| 991 | if (unlikely(!writeback_in_progress(bdi))) | ||
| 992 | bdi_start_background_writeback(bdi); | ||
| 993 | |||
| 994 | /* | ||
| 995 | * bdi_thresh is not treated as a limiting factor the way | ||
| 996 | * dirty_thresh is, for these reasons: | ||
| 997 | * - in JBOD setup, bdi_thresh can fluctuate a lot | ||
| 998 | * - in a system with HDD and USB key, the USB key may somehow | ||
| 999 | * go into state (bdi_dirty >> bdi_thresh) either because | ||
| 1000 | * bdi_dirty starts high, or because bdi_thresh drops low. | ||
| 1001 | * In this case we don't want to hard throttle the USB key | ||
| 1002 | * dirtiers for 100 seconds until bdi_dirty drops under | ||
| 1003 | * bdi_thresh. Instead the auxiliary bdi control line in | ||
| 1004 | * bdi_position_ratio() will let the dirtier task progress | ||
| 1005 | * at some rate <= (write_bw / 2) for bringing down bdi_dirty. | ||
| 1006 | */ | ||
| 1029 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 1007 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
| 1030 | min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh); | ||
| 1031 | task_bdi_thresh = task_dirty_limit(current, bdi_thresh); | ||
| 1032 | 1008 | ||
| 1033 | /* | 1009 | /* |
| 1034 | * In order to avoid the stacked BDI deadlock we need | 1010 | * In order to avoid the stacked BDI deadlock we need |
| @@ -1040,57 +1016,41 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
| 1040 | * actually dirty; with m+n sitting in the percpu | 1016 | * actually dirty; with m+n sitting in the percpu |
| 1041 | * deltas. | 1017 | * deltas. |
| 1042 | */ | 1018 | */ |
| 1043 | if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) { | 1019 | if (bdi_thresh < 2 * bdi_stat_error(bdi)) { |
| 1044 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); | 1020 | bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); |
| 1045 | bdi_dirty = bdi_nr_reclaimable + | 1021 | bdi_dirty = bdi_reclaimable + |
| 1046 | bdi_stat_sum(bdi, BDI_WRITEBACK); | 1022 | bdi_stat_sum(bdi, BDI_WRITEBACK); |
| 1047 | } else { | 1023 | } else { |
| 1048 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | 1024 | bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); |
| 1049 | bdi_dirty = bdi_nr_reclaimable + | 1025 | bdi_dirty = bdi_reclaimable + |
| 1050 | bdi_stat(bdi, BDI_WRITEBACK); | 1026 | bdi_stat(bdi, BDI_WRITEBACK); |
| 1051 | } | 1027 | } |
| 1052 | 1028 | ||
| 1053 | /* | 1029 | dirty_exceeded = (bdi_dirty > bdi_thresh) || |
| 1054 | * The bdi thresh is somehow "soft" limit derived from the | ||
| 1055 | * global "hard" limit. The former helps to prevent heavy IO | ||
| 1056 | * bdi or process from holding back light ones; The latter is | ||
| 1057 | * the last resort safeguard. | ||
| 1058 | */ | ||
| 1059 | dirty_exceeded = (bdi_dirty > task_bdi_thresh) || | ||
| 1060 | (nr_dirty > dirty_thresh); | 1030 | (nr_dirty > dirty_thresh); |
| 1061 | clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) && | 1031 | if (dirty_exceeded && !bdi->dirty_exceeded) |
| 1062 | (nr_dirty <= dirty_thresh); | ||
| 1063 | |||
| 1064 | if (!dirty_exceeded) | ||
| 1065 | break; | ||
| 1066 | |||
| 1067 | if (!bdi->dirty_exceeded) | ||
| 1068 | bdi->dirty_exceeded = 1; | 1032 | bdi->dirty_exceeded = 1; |
| 1069 | 1033 | ||
| 1070 | bdi_update_bandwidth(bdi, dirty_thresh, background_thresh, | 1034 | bdi_update_bandwidth(bdi, dirty_thresh, background_thresh, |
| 1071 | nr_dirty, bdi_thresh, bdi_dirty, | 1035 | nr_dirty, bdi_thresh, bdi_dirty, |
| 1072 | start_time); | 1036 | start_time); |
| 1073 | 1037 | ||
| 1074 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | 1038 | dirty_ratelimit = bdi->dirty_ratelimit; |
| 1075 | * Unstable writes are a feature of certain networked | 1039 | pos_ratio = bdi_position_ratio(bdi, dirty_thresh, |
| 1076 | * filesystems (i.e. NFS) in which data may have been | 1040 | background_thresh, nr_dirty, |
| 1077 | * written to the server's write cache, but has not yet | 1041 | bdi_thresh, bdi_dirty); |
| 1078 | * been flushed to permanent storage. | 1042 | if (unlikely(pos_ratio == 0)) { |
| 1079 | * Only move pages to writeback if this bdi is over its | 1043 | pause = MAX_PAUSE; |
| 1080 | * threshold otherwise wait until the disk writes catch | 1044 | goto pause; |
| 1081 | * up. | ||
| 1082 | */ | ||
| 1083 | trace_balance_dirty_start(bdi); | ||
| 1084 | if (bdi_nr_reclaimable > task_bdi_thresh) { | ||
| 1085 | pages_written += writeback_inodes_wb(&bdi->wb, | ||
| 1086 | write_chunk); | ||
| 1087 | trace_balance_dirty_written(bdi, pages_written); | ||
| 1088 | if (pages_written >= write_chunk) | ||
| 1089 | break; /* We've done our duty */ | ||
| 1090 | } | 1045 | } |
| 1046 | task_ratelimit = (u64)dirty_ratelimit * | ||
| 1047 | pos_ratio >> RATELIMIT_CALC_SHIFT; | ||
| 1048 | pause = (HZ * pages_dirtied) / (task_ratelimit | 1); | ||
| 1049 | pause = min_t(long, pause, MAX_PAUSE); | ||
| 1050 | |||
| 1051 | pause: | ||
| 1091 | __set_current_state(TASK_UNINTERRUPTIBLE); | 1052 | __set_current_state(TASK_UNINTERRUPTIBLE); |
| 1092 | io_schedule_timeout(pause); | 1053 | io_schedule_timeout(pause); |
| 1093 | trace_balance_dirty_wait(bdi); | ||
| 1094 | 1054 | ||
| 1095 | dirty_thresh = hard_dirty_limit(dirty_thresh); | 1055 | dirty_thresh = hard_dirty_limit(dirty_thresh); |
| 1096 | /* | 1056 | /* |
| @@ -1099,22 +1059,11 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
| 1099 | * 200ms is typically more than enough to curb heavy dirtiers; | 1059 | * 200ms is typically more than enough to curb heavy dirtiers; |
| 1100 | * (b) the pause time limit makes the dirtiers more responsive. | 1060 | * (b) the pause time limit makes the dirtiers more responsive. |
| 1101 | */ | 1061 | */ |
| 1102 | if (nr_dirty < dirty_thresh && | 1062 | if (nr_dirty < dirty_thresh) |
| 1103 | bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 && | ||
| 1104 | time_after(jiffies, start_time + MAX_PAUSE)) | ||
| 1105 | break; | 1063 | break; |
| 1106 | |||
| 1107 | /* | ||
| 1108 | * Increase the delay for each loop, up to our previous | ||
| 1109 | * default of taking a 100ms nap. | ||
| 1110 | */ | ||
| 1111 | pause <<= 1; | ||
| 1112 | if (pause > HZ / 10) | ||
| 1113 | pause = HZ / 10; | ||
| 1114 | } | 1064 | } |
| 1115 | 1065 | ||
| 1116 | /* Clear dirty_exceeded flag only when no task can exceed the limit */ | 1066 | if (!dirty_exceeded && bdi->dirty_exceeded) |
| 1117 | if (clear_dirty_exceeded && bdi->dirty_exceeded) | ||
| 1118 | bdi->dirty_exceeded = 0; | 1067 | bdi->dirty_exceeded = 0; |
| 1119 | 1068 | ||
| 1120 | current->nr_dirtied = 0; | 1069 | current->nr_dirtied = 0; |
| @@ -1131,8 +1080,10 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
| 1131 | * In normal mode, we start background writeout at the lower | 1080 | * In normal mode, we start background writeout at the lower |
| 1132 | * background_thresh, to keep the amount of dirty memory low. | 1081 | * background_thresh, to keep the amount of dirty memory low. |
| 1133 | */ | 1082 | */ |
| 1134 | if ((laptop_mode && pages_written) || | 1083 | if (laptop_mode) |
| 1135 | (!laptop_mode && (nr_reclaimable > background_thresh))) | 1084 | return; |
| 1085 | |||
| 1086 | if (nr_reclaimable > background_thresh) | ||
| 1136 | bdi_start_background_writeback(bdi); | 1087 | bdi_start_background_writeback(bdi); |
| 1137 | } | 1088 | } |
| 1138 | 1089 | ||
