aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/block/cfq-iosched.txt45
-rw-r--r--Documentation/cgroups/blkio-controller.txt28
-rw-r--r--block/blk-cgroup.c2
-rw-r--r--block/blk-core.c6
-rw-r--r--block/blk-sysfs.c1
-rw-r--r--block/blk.h8
-rw-r--r--block/cfq-iosched.c103
-rw-r--r--block/elevator.c44
-rw-r--r--drivers/block/cciss.c11
-rw-r--r--drivers/block/loop.c2
-rw-r--r--drivers/block/mg_disk.c3
-rw-r--r--drivers/s390/char/tape_block.c3
-rw-r--r--fs/bio-integrity.c4
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--include/linux/elevator.h1
-rw-r--r--lib/scatterlist.c14
-rw-r--r--mm/backing-dev.c7
17 files changed, 238 insertions, 46 deletions
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
new file mode 100644
index 000000000000..e578feed6d81
--- /dev/null
+++ b/Documentation/block/cfq-iosched.txt
@@ -0,0 +1,45 @@
1CFQ ioscheduler tunables
2========================
3
4slice_idle
5----------
6This specifies how long CFQ should idle for the next request on certain cfq
7queues (for sequential workloads) and service trees (for random workloads)
8before the queue is expired and CFQ selects the next queue to dispatch from.
9
10By default slice_idle is a non-zero value. That means by default we idle on
11queues/service trees. This can be very helpful on highly seeky media like
12single spindle SATA/SAS disks where we can cut down on overall number of
13seeks and see improved throughput.
14
15Setting slice_idle to 0 will remove all the idling on queues/service tree
16level and one should see an overall improved throughput on faster storage
17devices like multiple SATA/SAS disks in hardware RAID configuration. The
18downside is that the isolation provided from WRITES also goes down and the
19notion of IO priority becomes weaker.
20
21So depending on storage and workload, it might be useful to set slice_idle=0.
22In general I think for SATA/SAS disks and software RAID of SATA/SAS disks
23keeping slice_idle enabled should be useful. For any configurations where
24there are multiple spindles behind single LUN (Host based hardware RAID
25controller or for storage arrays), setting slice_idle=0 might end up in better
26throughput and acceptable latencies.
27
28CFQ IOPS Mode for group scheduling
29===================================
30Basic CFQ design is to provide priority based time slices. Higher priority
31process gets bigger time slice and lower priority process gets smaller time
32slice. Measuring time becomes harder if storage is fast and supports NCQ and
33it would be better to dispatch multiple requests from multiple cfq queues in
34request queue at a time. In such scenario, it is not possible to measure time
35consumed by single queue accurately.
36
37What is possible though is to measure number of requests dispatched from a
38single queue and also allow dispatch from multiple cfq queues at the same time.
39This effectively becomes the fairness in terms of IOPS (IO operations per
40second).
41
42If one sets slice_idle=0 and if storage supports NCQ, CFQ internally switches
43to IOPS mode and starts providing fairness in terms of number of requests
44dispatched. Note that this mode switching takes effect only for group
45scheduling. For non-cgroup users nothing should change.
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 48e0b21b0059..6919d62591d9 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -217,6 +217,7 @@ Details of cgroup files
217CFQ sysfs tunable 217CFQ sysfs tunable
218================= 218=================
219/sys/block/<disk>/queue/iosched/group_isolation 219/sys/block/<disk>/queue/iosched/group_isolation
220-----------------------------------------------
220 221
221If group_isolation=1, it provides stronger isolation between groups at the 222If group_isolation=1, it provides stronger isolation between groups at the
222expense of throughput. By default group_isolation is 0. In general that 223expense of throughput. By default group_isolation is 0. In general that
@@ -243,6 +244,33 @@ By default one should run with group_isolation=0. If that is not sufficient
243and one wants stronger isolation between groups, then set group_isolation=1 244and one wants stronger isolation between groups, then set group_isolation=1
244but this will come at cost of reduced throughput. 245but this will come at cost of reduced throughput.
245 246
247/sys/block/<disk>/queue/iosched/slice_idle
248------------------------------------------
249On faster hardware CFQ can be slow, especially with sequential workloads.
250This happens because CFQ idles on a single queue and single queue might not
251drive deeper request queue depths to keep the storage busy. In such scenarios
252one can try setting slice_idle=0 and that would switch CFQ to IOPS
253(IO operations per second) mode on NCQ supporting hardware.
254
255That means CFQ will not idle between cfq queues of a cfq group and hence be
256able to drive higher queue depth and achieve better throughput. That also
257means that cfq provides fairness among groups in terms of IOPS and not in
258terms of disk time.
259
260/sys/block/<disk>/queue/iosched/group_idle
261------------------------------------------
262If one disables idling on individual cfq queues and cfq service trees by
263setting slice_idle=0, group_idle kicks in. That means CFQ will still idle
264on the group in an attempt to provide fairness among groups.
265
266By default group_idle is the same as slice_idle and does not do anything if
267slice_idle is enabled.
268
269One can experience an overall throughput drop if one has created multiple
270groups and put applications in those groups which are not driving enough
271IO to keep the disk busy. In that case set group_idle=0, and CFQ will not idle
272on individual groups and throughput should improve.
273
246What works 274What works
247========== 275==========
248- Currently only sync IO queues are supported. All the buffered writes are 276- Currently only sync IO queues are supported. All the buffered writes are
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index a6809645d212..2fef1ef931a0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -966,7 +966,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
966 966
967 /* Currently we do not support hierarchy deeper than two level (0,1) */ 967 /* Currently we do not support hierarchy deeper than two level (0,1) */
968 if (parent != cgroup->top_cgroup) 968 if (parent != cgroup->top_cgroup)
969 return ERR_PTR(-EINVAL); 969 return ERR_PTR(-EPERM);
970 970
971 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); 971 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
972 if (!blkcg) 972 if (!blkcg)
diff --git a/block/blk-core.c b/block/blk-core.c
index ee1a1e7e63cc..32a1c123dfb3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1198,9 +1198,9 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1198 int el_ret; 1198 int el_ret;
1199 unsigned int bytes = bio->bi_size; 1199 unsigned int bytes = bio->bi_size;
1200 const unsigned short prio = bio_prio(bio); 1200 const unsigned short prio = bio_prio(bio);
1201 const bool sync = (bio->bi_rw & REQ_SYNC); 1201 const bool sync = !!(bio->bi_rw & REQ_SYNC);
1202 const bool unplug = (bio->bi_rw & REQ_UNPLUG); 1202 const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
1203 const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1203 const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
1204 int rw_flags; 1204 int rw_flags;
1205 1205
1206 if ((bio->bi_rw & REQ_HARDBARRIER) && 1206 if ((bio->bi_rw & REQ_HARDBARRIER) &&
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 001ab18078f5..0749b89c6885 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -511,6 +511,7 @@ int blk_register_queue(struct gendisk *disk)
511 kobject_uevent(&q->kobj, KOBJ_REMOVE); 511 kobject_uevent(&q->kobj, KOBJ_REMOVE);
512 kobject_del(&q->kobj); 512 kobject_del(&q->kobj);
513 blk_trace_remove_sysfs(disk_to_dev(disk)); 513 blk_trace_remove_sysfs(disk_to_dev(disk));
514 kobject_put(&dev->kobj);
514 return ret; 515 return ret;
515 } 516 }
516 517
diff --git a/block/blk.h b/block/blk.h
index 6e7dc87141e4..d6b911ac002c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -142,14 +142,18 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
142 142
143static inline int blk_cpu_to_group(int cpu) 143static inline int blk_cpu_to_group(int cpu)
144{ 144{
145 int group = NR_CPUS;
145#ifdef CONFIG_SCHED_MC 146#ifdef CONFIG_SCHED_MC
146 const struct cpumask *mask = cpu_coregroup_mask(cpu); 147 const struct cpumask *mask = cpu_coregroup_mask(cpu);
147 return cpumask_first(mask); 148 group = cpumask_first(mask);
148#elif defined(CONFIG_SCHED_SMT) 149#elif defined(CONFIG_SCHED_SMT)
149 return cpumask_first(topology_thread_cpumask(cpu)); 150 group = cpumask_first(topology_thread_cpumask(cpu));
150#else 151#else
151 return cpu; 152 return cpu;
152#endif 153#endif
154 if (likely(group < NR_CPUS))
155 return group;
156 return cpu;
153} 157}
154 158
155/* 159/*
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index eb4086f7dfef..f65c6f01c475 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -30,6 +30,7 @@ static const int cfq_slice_sync = HZ / 10;
30static int cfq_slice_async = HZ / 25; 30static int cfq_slice_async = HZ / 25;
31static const int cfq_slice_async_rq = 2; 31static const int cfq_slice_async_rq = 2;
32static int cfq_slice_idle = HZ / 125; 32static int cfq_slice_idle = HZ / 125;
33static int cfq_group_idle = HZ / 125;
33static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ 34static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
34static const int cfq_hist_divisor = 4; 35static const int cfq_hist_divisor = 4;
35 36
@@ -147,6 +148,8 @@ struct cfq_queue {
147 struct cfq_queue *new_cfqq; 148 struct cfq_queue *new_cfqq;
148 struct cfq_group *cfqg; 149 struct cfq_group *cfqg;
149 struct cfq_group *orig_cfqg; 150 struct cfq_group *orig_cfqg;
151 /* Number of sectors dispatched from queue in single dispatch round */
152 unsigned long nr_sectors;
150}; 153};
151 154
152/* 155/*
@@ -198,6 +201,8 @@ struct cfq_group {
198 struct hlist_node cfqd_node; 201 struct hlist_node cfqd_node;
199 atomic_t ref; 202 atomic_t ref;
200#endif 203#endif
204 /* number of requests that are on the dispatch list or inside driver */
205 int dispatched;
201}; 206};
202 207
203/* 208/*
@@ -271,6 +276,7 @@ struct cfq_data {
271 unsigned int cfq_slice[2]; 276 unsigned int cfq_slice[2];
272 unsigned int cfq_slice_async_rq; 277 unsigned int cfq_slice_async_rq;
273 unsigned int cfq_slice_idle; 278 unsigned int cfq_slice_idle;
279 unsigned int cfq_group_idle;
274 unsigned int cfq_latency; 280 unsigned int cfq_latency;
275 unsigned int cfq_group_isolation; 281 unsigned int cfq_group_isolation;
276 282
@@ -378,6 +384,21 @@ CFQ_CFQQ_FNS(wait_busy);
378 &cfqg->service_trees[i][j]: NULL) \ 384 &cfqg->service_trees[i][j]: NULL) \
379 385
380 386
387static inline bool iops_mode(struct cfq_data *cfqd)
388{
389 /*
390 * If we are not idling on queues and it is a NCQ drive, parallel
391 * execution of requests is on and measuring time is not possible
392 * in most of the cases until and unless we drive shallower queue
393 * depths and that becomes a performance bottleneck. In such cases
394 * switch to start providing fairness in terms of number of IOs.
395 */
396 if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
397 return true;
398 else
399 return false;
400}
401
381static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) 402static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
382{ 403{
383 if (cfq_class_idle(cfqq)) 404 if (cfq_class_idle(cfqq))
@@ -906,7 +927,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
906 slice_used = cfqq->allocated_slice; 927 slice_used = cfqq->allocated_slice;
907 } 928 }
908 929
909 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
910 return slice_used; 930 return slice_used;
911} 931}
912 932
@@ -914,19 +934,21 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
914 struct cfq_queue *cfqq) 934 struct cfq_queue *cfqq)
915{ 935{
916 struct cfq_rb_root *st = &cfqd->grp_service_tree; 936 struct cfq_rb_root *st = &cfqd->grp_service_tree;
917 unsigned int used_sl, charge_sl; 937 unsigned int used_sl, charge;
918 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 938 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
919 - cfqg->service_tree_idle.count; 939 - cfqg->service_tree_idle.count;
920 940
921 BUG_ON(nr_sync < 0); 941 BUG_ON(nr_sync < 0);
922 used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq); 942 used_sl = charge = cfq_cfqq_slice_usage(cfqq);
923 943
924 if (!cfq_cfqq_sync(cfqq) && !nr_sync) 944 if (iops_mode(cfqd))
925 charge_sl = cfqq->allocated_slice; 945 charge = cfqq->slice_dispatch;
946 else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
947 charge = cfqq->allocated_slice;
926 948
927 /* Can't update vdisktime while group is on service tree */ 949 /* Can't update vdisktime while group is on service tree */
928 cfq_rb_erase(&cfqg->rb_node, st); 950 cfq_rb_erase(&cfqg->rb_node, st);
929 cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg); 951 cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
930 __cfq_group_service_tree_add(st, cfqg); 952 __cfq_group_service_tree_add(st, cfqg);
931 953
932 /* This group is being expired. Save the context */ 954 /* This group is being expired. Save the context */
@@ -940,6 +962,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
940 962
941 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 963 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
942 st->min_vdisktime); 964 st->min_vdisktime);
965 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
966 " sect=%u", used_sl, cfqq->slice_dispatch, charge,
967 iops_mode(cfqd), cfqq->nr_sectors);
943 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); 968 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
944 cfq_blkiocg_set_start_empty_time(&cfqg->blkg); 969 cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
945} 970}
@@ -1587,6 +1612,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
1587 cfqq->allocated_slice = 0; 1612 cfqq->allocated_slice = 0;
1588 cfqq->slice_end = 0; 1613 cfqq->slice_end = 0;
1589 cfqq->slice_dispatch = 0; 1614 cfqq->slice_dispatch = 0;
1615 cfqq->nr_sectors = 0;
1590 1616
1591 cfq_clear_cfqq_wait_request(cfqq); 1617 cfq_clear_cfqq_wait_request(cfqq);
1592 cfq_clear_cfqq_must_dispatch(cfqq); 1618 cfq_clear_cfqq_must_dispatch(cfqq);
@@ -1839,6 +1865,9 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1839 BUG_ON(!service_tree); 1865 BUG_ON(!service_tree);
1840 BUG_ON(!service_tree->count); 1866 BUG_ON(!service_tree->count);
1841 1867
1868 if (!cfqd->cfq_slice_idle)
1869 return false;
1870
1842 /* We never do for idle class queues. */ 1871 /* We never do for idle class queues. */
1843 if (prio == IDLE_WORKLOAD) 1872 if (prio == IDLE_WORKLOAD)
1844 return false; 1873 return false;
@@ -1863,7 +1892,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1863{ 1892{
1864 struct cfq_queue *cfqq = cfqd->active_queue; 1893 struct cfq_queue *cfqq = cfqd->active_queue;
1865 struct cfq_io_context *cic; 1894 struct cfq_io_context *cic;
1866 unsigned long sl; 1895 unsigned long sl, group_idle = 0;
1867 1896
1868 /* 1897 /*
1869 * SSD device without seek penalty, disable idling. But only do so 1898 * SSD device without seek penalty, disable idling. But only do so
@@ -1879,8 +1908,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1879 /* 1908 /*
1880 * idle is disabled, either manually or by past process history 1909 * idle is disabled, either manually or by past process history
1881 */ 1910 */
1882 if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) 1911 if (!cfq_should_idle(cfqd, cfqq)) {
1883 return; 1912 /* no queue idling. Check for group idling */
1913 if (cfqd->cfq_group_idle)
1914 group_idle = cfqd->cfq_group_idle;
1915 else
1916 return;
1917 }
1884 1918
1885 /* 1919 /*
1886 * still active requests from this queue, don't idle 1920 * still active requests from this queue, don't idle
@@ -1907,13 +1941,21 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1907 return; 1941 return;
1908 } 1942 }
1909 1943
1944 /* There are other queues in the group, don't do group idle */
1945 if (group_idle && cfqq->cfqg->nr_cfqq > 1)
1946 return;
1947
1910 cfq_mark_cfqq_wait_request(cfqq); 1948 cfq_mark_cfqq_wait_request(cfqq);
1911 1949
1912 sl = cfqd->cfq_slice_idle; 1950 if (group_idle)
1951 sl = cfqd->cfq_group_idle;
1952 else
1953 sl = cfqd->cfq_slice_idle;
1913 1954
1914 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1955 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1915 cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); 1956 cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
1916 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1957 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
1958 group_idle ? 1 : 0);
1917} 1959}
1918 1960
1919/* 1961/*
@@ -1929,9 +1971,11 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1929 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); 1971 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
1930 cfq_remove_request(rq); 1972 cfq_remove_request(rq);
1931 cfqq->dispatched++; 1973 cfqq->dispatched++;
1974 (RQ_CFQG(rq))->dispatched++;
1932 elv_dispatch_sort(q, rq); 1975 elv_dispatch_sort(q, rq);
1933 1976
1934 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 1977 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
1978 cfqq->nr_sectors += blk_rq_sectors(rq);
1935 cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), 1979 cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
1936 rq_data_dir(rq), rq_is_sync(rq)); 1980 rq_data_dir(rq), rq_is_sync(rq));
1937} 1981}
@@ -2198,7 +2242,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2198 cfqq = NULL; 2242 cfqq = NULL;
2199 goto keep_queue; 2243 goto keep_queue;
2200 } else 2244 } else
2201 goto expire; 2245 goto check_group_idle;
2202 } 2246 }
2203 2247
2204 /* 2248 /*
@@ -2226,8 +2270,23 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2226 * flight or is idling for a new request, allow either of these 2270 * flight or is idling for a new request, allow either of these
2227 * conditions to happen (or time out) before selecting a new queue. 2271 * conditions to happen (or time out) before selecting a new queue.
2228 */ 2272 */
2229 if (timer_pending(&cfqd->idle_slice_timer) || 2273 if (timer_pending(&cfqd->idle_slice_timer)) {
2230 (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) { 2274 cfqq = NULL;
2275 goto keep_queue;
2276 }
2277
2278 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2279 cfqq = NULL;
2280 goto keep_queue;
2281 }
2282
2283 /*
2284 * If group idle is enabled and there are requests dispatched from
2285 * this group, wait for requests to complete.
2286 */
2287check_group_idle:
2288 if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
2289 && cfqq->cfqg->dispatched) {
2231 cfqq = NULL; 2290 cfqq = NULL;
2232 goto keep_queue; 2291 goto keep_queue;
2233 } 2292 }
@@ -3375,6 +3434,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3375 WARN_ON(!cfqq->dispatched); 3434 WARN_ON(!cfqq->dispatched);
3376 cfqd->rq_in_driver--; 3435 cfqd->rq_in_driver--;
3377 cfqq->dispatched--; 3436 cfqq->dispatched--;
3437 (RQ_CFQG(rq))->dispatched--;
3378 cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, 3438 cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
3379 rq_start_time_ns(rq), rq_io_start_time_ns(rq), 3439 rq_start_time_ns(rq), rq_io_start_time_ns(rq),
3380 rq_data_dir(rq), rq_is_sync(rq)); 3440 rq_data_dir(rq), rq_is_sync(rq));
@@ -3404,7 +3464,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3404 * the queue. 3464 * the queue.
3405 */ 3465 */
3406 if (cfq_should_wait_busy(cfqd, cfqq)) { 3466 if (cfq_should_wait_busy(cfqd, cfqq)) {
3407 cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; 3467 unsigned long extend_sl = cfqd->cfq_slice_idle;
3468 if (!cfqd->cfq_slice_idle)
3469 extend_sl = cfqd->cfq_group_idle;
3470 cfqq->slice_end = jiffies + extend_sl;
3408 cfq_mark_cfqq_wait_busy(cfqq); 3471 cfq_mark_cfqq_wait_busy(cfqq);
3409 cfq_log_cfqq(cfqd, cfqq, "will busy wait"); 3472 cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3410 } 3473 }
@@ -3850,6 +3913,7 @@ static void *cfq_init_queue(struct request_queue *q)
3850 cfqd->cfq_slice[1] = cfq_slice_sync; 3913 cfqd->cfq_slice[1] = cfq_slice_sync;
3851 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 3914 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
3852 cfqd->cfq_slice_idle = cfq_slice_idle; 3915 cfqd->cfq_slice_idle = cfq_slice_idle;
3916 cfqd->cfq_group_idle = cfq_group_idle;
3853 cfqd->cfq_latency = 1; 3917 cfqd->cfq_latency = 1;
3854 cfqd->cfq_group_isolation = 0; 3918 cfqd->cfq_group_isolation = 0;
3855 cfqd->hw_tag = -1; 3919 cfqd->hw_tag = -1;
@@ -3922,6 +3986,7 @@ SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
3922SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); 3986SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
3923SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); 3987SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
3924SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); 3988SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
3989SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
3925SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); 3990SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
3926SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 3991SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
3927SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 3992SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
@@ -3954,6 +4019,7 @@ STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
3954STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, 4019STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
3955 UINT_MAX, 0); 4020 UINT_MAX, 0);
3956STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); 4021STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4022STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
3957STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); 4023STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
3958STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); 4024STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
3959STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 4025STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
@@ -3975,6 +4041,7 @@ static struct elv_fs_entry cfq_attrs[] = {
3975 CFQ_ATTR(slice_async), 4041 CFQ_ATTR(slice_async),
3976 CFQ_ATTR(slice_async_rq), 4042 CFQ_ATTR(slice_async_rq),
3977 CFQ_ATTR(slice_idle), 4043 CFQ_ATTR(slice_idle),
4044 CFQ_ATTR(group_idle),
3978 CFQ_ATTR(low_latency), 4045 CFQ_ATTR(low_latency),
3979 CFQ_ATTR(group_isolation), 4046 CFQ_ATTR(group_isolation),
3980 __ATTR_NULL 4047 __ATTR_NULL
@@ -4028,6 +4095,12 @@ static int __init cfq_init(void)
4028 if (!cfq_slice_idle) 4095 if (!cfq_slice_idle)
4029 cfq_slice_idle = 1; 4096 cfq_slice_idle = 1;
4030 4097
4098#ifdef CONFIG_CFQ_GROUP_IOSCHED
4099 if (!cfq_group_idle)
4100 cfq_group_idle = 1;
4101#else
4102 cfq_group_idle = 0;
4103#endif
4031 if (cfq_slab_setup()) 4104 if (cfq_slab_setup())
4032 return -ENOMEM; 4105 return -ENOMEM;
4033 4106
diff --git a/block/elevator.c b/block/elevator.c
index ec585c9554d3..205b09a5bd9e 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -1009,18 +1009,19 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1009{ 1009{
1010 struct elevator_queue *old_elevator, *e; 1010 struct elevator_queue *old_elevator, *e;
1011 void *data; 1011 void *data;
1012 int err;
1012 1013
1013 /* 1014 /*
1014 * Allocate new elevator 1015 * Allocate new elevator
1015 */ 1016 */
1016 e = elevator_alloc(q, new_e); 1017 e = elevator_alloc(q, new_e);
1017 if (!e) 1018 if (!e)
1018 return 0; 1019 return -ENOMEM;
1019 1020
1020 data = elevator_init_queue(q, e); 1021 data = elevator_init_queue(q, e);
1021 if (!data) { 1022 if (!data) {
1022 kobject_put(&e->kobj); 1023 kobject_put(&e->kobj);
1023 return 0; 1024 return -ENOMEM;
1024 } 1025 }
1025 1026
1026 /* 1027 /*
@@ -1043,7 +1044,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1043 1044
1044 __elv_unregister_queue(old_elevator); 1045 __elv_unregister_queue(old_elevator);
1045 1046
1046 if (elv_register_queue(q)) 1047 err = elv_register_queue(q);
1048 if (err)
1047 goto fail_register; 1049 goto fail_register;
1048 1050
1049 /* 1051 /*
@@ -1056,7 +1058,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1056 1058
1057 blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); 1059 blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
1058 1060
1059 return 1; 1061 return 0;
1060 1062
1061fail_register: 1063fail_register:
1062 /* 1064 /*
@@ -1071,17 +1073,19 @@ fail_register:
1071 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); 1073 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
1072 spin_unlock_irq(q->queue_lock); 1074 spin_unlock_irq(q->queue_lock);
1073 1075
1074 return 0; 1076 return err;
1075} 1077}
1076 1078
1077ssize_t elv_iosched_store(struct request_queue *q, const char *name, 1079/*
1078 size_t count) 1080 * Switch this queue to the given IO scheduler.
1081 */
1082int elevator_change(struct request_queue *q, const char *name)
1079{ 1083{
1080 char elevator_name[ELV_NAME_MAX]; 1084 char elevator_name[ELV_NAME_MAX];
1081 struct elevator_type *e; 1085 struct elevator_type *e;
1082 1086
1083 if (!q->elevator) 1087 if (!q->elevator)
1084 return count; 1088 return -ENXIO;
1085 1089
1086 strlcpy(elevator_name, name, sizeof(elevator_name)); 1090 strlcpy(elevator_name, name, sizeof(elevator_name));
1087 e = elevator_get(strstrip(elevator_name)); 1091 e = elevator_get(strstrip(elevator_name));
@@ -1092,13 +1096,27 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
1092 1096
1093 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) { 1097 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
1094 elevator_put(e); 1098 elevator_put(e);
1095 return count; 1099 return 0;
1096 } 1100 }
1097 1101
1098 if (!elevator_switch(q, e)) 1102 return elevator_switch(q, e);
1099 printk(KERN_ERR "elevator: switch to %s failed\n", 1103}
1100 elevator_name); 1104EXPORT_SYMBOL(elevator_change);
1101 return count; 1105
1106ssize_t elv_iosched_store(struct request_queue *q, const char *name,
1107 size_t count)
1108{
1109 int ret;
1110
1111 if (!q->elevator)
1112 return count;
1113
1114 ret = elevator_change(q, name);
1115 if (!ret)
1116 return count;
1117
1118 printk(KERN_ERR "elevator: switch to %s failed\n", name);
1119 return ret;
1102} 1120}
1103 1121
1104ssize_t elv_iosched_show(struct request_queue *q, char *name) 1122ssize_t elv_iosched_show(struct request_queue *q, char *name)
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 31064df1370a..6124c2fd2d33 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -297,6 +297,8 @@ static void enqueue_cmd_and_start_io(ctlr_info_t *h,
297 spin_lock_irqsave(&h->lock, flags); 297 spin_lock_irqsave(&h->lock, flags);
298 addQ(&h->reqQ, c); 298 addQ(&h->reqQ, c);
299 h->Qdepth++; 299 h->Qdepth++;
300 if (h->Qdepth > h->maxQsinceinit)
301 h->maxQsinceinit = h->Qdepth;
300 start_io(h); 302 start_io(h);
301 spin_unlock_irqrestore(&h->lock, flags); 303 spin_unlock_irqrestore(&h->lock, flags);
302} 304}
@@ -4519,6 +4521,12 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4519 misc_fw_support = readl(&cfgtable->misc_fw_support); 4521 misc_fw_support = readl(&cfgtable->misc_fw_support);
4520 use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET; 4522 use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET;
4521 4523
4524 /* The doorbell reset seems to cause lockups on some Smart
4525 * Arrays (e.g. P410, P410i, maybe others). Until this is
4526 * fixed or at least isolated, avoid the doorbell reset.
4527 */
4528 use_doorbell = 0;
4529
4522 rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell); 4530 rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell);
4523 if (rc) 4531 if (rc)
4524 goto unmap_cfgtable; 4532 goto unmap_cfgtable;
@@ -4712,6 +4720,9 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4712 h->scatter_list = kmalloc(h->max_commands * 4720 h->scatter_list = kmalloc(h->max_commands *
4713 sizeof(struct scatterlist *), 4721 sizeof(struct scatterlist *),
4714 GFP_KERNEL); 4722 GFP_KERNEL);
4723 if (!h->scatter_list)
4724 goto clean4;
4725
4715 for (k = 0; k < h->nr_cmds; k++) { 4726 for (k = 0; k < h->nr_cmds; k++) {
4716 h->scatter_list[k] = kmalloc(sizeof(struct scatterlist) * 4727 h->scatter_list[k] = kmalloc(sizeof(struct scatterlist) *
4717 h->maxsgentries, 4728 h->maxsgentries,
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f3c636d23718..91797bbbe702 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -477,7 +477,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
477 pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset; 477 pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
478 478
479 if (bio_rw(bio) == WRITE) { 479 if (bio_rw(bio) == WRITE) {
480 bool barrier = (bio->bi_rw & REQ_HARDBARRIER); 480 bool barrier = !!(bio->bi_rw & REQ_HARDBARRIER);
481 struct file *file = lo->lo_backing_file; 481 struct file *file = lo->lo_backing_file;
482 482
483 if (barrier) { 483 if (barrier) {
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index b82c5ce5e9df..76fa3deaee84 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -974,8 +974,7 @@ static int mg_probe(struct platform_device *plat_dev)
974 host->breq->queuedata = host; 974 host->breq->queuedata = host;
975 975
976 /* mflash is random device, thanx for the noop */ 976 /* mflash is random device, thanx for the noop */
977 elevator_exit(host->breq->elevator); 977 err = elevator_change(host->breq, "noop");
978 err = elevator_init(host->breq, "noop");
979 if (err) { 978 if (err) {
980 printk(KERN_ERR "%s:%d (elevator_init) fail\n", 979 printk(KERN_ERR "%s:%d (elevator_init) fail\n",
981 __func__, __LINE__); 980 __func__, __LINE__);
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index b7de02525ec9..85cf607fc78f 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -217,8 +217,7 @@ tapeblock_setup_device(struct tape_device * device)
217 if (!blkdat->request_queue) 217 if (!blkdat->request_queue)
218 return -ENOMEM; 218 return -ENOMEM;
219 219
220 elevator_exit(blkdat->request_queue->elevator); 220 rc = elevator_change(blkdat->request_queue, "noop");
221 rc = elevator_init(blkdat->request_queue, "noop");
222 if (rc) 221 if (rc)
223 goto cleanup_queue; 222 goto cleanup_queue;
224 223
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 612a5c38d3c1..4d0ff5ee27b8 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -413,10 +413,10 @@ int bio_integrity_prep(struct bio *bio)
413 413
414 /* Allocate kernel buffer for protection data */ 414 /* Allocate kernel buffer for protection data */
415 len = sectors * blk_integrity_tuple_size(bi); 415 len = sectors * blk_integrity_tuple_size(bi);
416 buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); 416 buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
417 if (unlikely(buf == NULL)) { 417 if (unlikely(buf == NULL)) {
418 printk(KERN_ERR "could not allocate integrity buffer\n"); 418 printk(KERN_ERR "could not allocate integrity buffer\n");
419 return -EIO; 419 return -ENOMEM;
420 } 420 }
421 421
422 end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 422 end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7d9d06ba184b..81e086d8aa57 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -808,7 +808,7 @@ int bdi_writeback_thread(void *data)
808 wb->last_active = jiffies; 808 wb->last_active = jiffies;
809 809
810 set_current_state(TASK_INTERRUPTIBLE); 810 set_current_state(TASK_INTERRUPTIBLE);
811 if (!list_empty(&bdi->work_list)) { 811 if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
812 __set_current_state(TASK_RUNNING); 812 __set_current_state(TASK_RUNNING);
813 continue; 813 continue;
814 } 814 }
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 2c958f4fce1e..926b50322a46 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -136,6 +136,7 @@ extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t);
136 136
137extern int elevator_init(struct request_queue *, char *); 137extern int elevator_init(struct request_queue *, char *);
138extern void elevator_exit(struct elevator_queue *); 138extern void elevator_exit(struct elevator_queue *);
139extern int elevator_change(struct request_queue *, const char *);
139extern int elv_rq_merge_ok(struct request *, struct bio *); 140extern int elv_rq_merge_ok(struct request *, struct bio *);
140 141
141/* 142/*
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index a5ec42868f99..4ceb05d772ae 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -248,8 +248,18 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents,
248 left -= sg_size; 248 left -= sg_size;
249 249
250 sg = alloc_fn(alloc_size, gfp_mask); 250 sg = alloc_fn(alloc_size, gfp_mask);
251 if (unlikely(!sg)) 251 if (unlikely(!sg)) {
252 return -ENOMEM; 252 /*
253 * Adjust entry count to reflect that the last
254 * entry of the previous table won't be used for
255 * linkage. Without this, sg_kfree() may get
256 * confused.
257 */
258 if (prv)
259 table->nents = ++table->orig_nents;
260
261 return -ENOMEM;
262 }
253 263
254 sg_init_table(sg, alloc_size); 264 sg_init_table(sg, alloc_size);
255 table->nents = table->orig_nents += sg_size; 265 table->nents = table->orig_nents += sg_size;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index eaa4a5bbe063..c2bf86f470ed 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -445,8 +445,8 @@ static int bdi_forker_thread(void *ptr)
445 switch (action) { 445 switch (action) {
446 case FORK_THREAD: 446 case FORK_THREAD:
447 __set_current_state(TASK_RUNNING); 447 __set_current_state(TASK_RUNNING);
448 task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s", 448 task = kthread_create(bdi_writeback_thread, &bdi->wb,
449 dev_name(bdi->dev)); 449 "flush-%s", dev_name(bdi->dev));
450 if (IS_ERR(task)) { 450 if (IS_ERR(task)) {
451 /* 451 /*
452 * If thread creation fails, force writeout of 452 * If thread creation fails, force writeout of
@@ -457,10 +457,13 @@ static int bdi_forker_thread(void *ptr)
457 /* 457 /*
458 * The spinlock makes sure we do not lose 458 * The spinlock makes sure we do not lose
459 * wake-ups when racing with 'bdi_queue_work()'. 459 * wake-ups when racing with 'bdi_queue_work()'.
460 * And as soon as the bdi thread is visible, we
461 * can start it.
460 */ 462 */
461 spin_lock_bh(&bdi->wb_lock); 463 spin_lock_bh(&bdi->wb_lock);
462 bdi->wb.task = task; 464 bdi->wb.task = task;
463 spin_unlock_bh(&bdi->wb_lock); 465 spin_unlock_bh(&bdi->wb_lock);
466 wake_up_process(task);
464 } 467 }
465 break; 468 break;
466 469