about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorDarrick J. Wong <darrick.wong@oracle.com>2013-02-21 19:42:55 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2013-02-21 20:22:20 -0500
commitffecfd1a72fccfcee3dabb99b9ecba9735318f90 (patch)
treea5c3caf67249ec811a2c2c95678d9349fd8e2412
parent13575ca14fcdacd1ad914d00bc63eb4d96280986 (diff)
block: optionally snapshot page contents to provide stable pages during write
This provides a band-aid to provide stable page writes on jbd without
needing to backport the fixed locking and page writeback bit handling
schemes of jbd2.  The band-aid works by using bounce buffers to snapshot
page contents instead of waiting.

For those wondering about the ext3 bandage -- fixing the jbd locking
(which was done as part of ext4dev years ago) is a lot of surgery, and
setting PG_writeback on data pages when we actually hold the page lock
dropped ext3 performance by nearly an order of magnitude.  If we're
going to migrate iscsi and raid to use stable page writes, the
complaints about high latency will likely return.  We might as well
centralize their page snapshotting thing to one place.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Tested-by: Andy Lutomirski <luto@amacapital.net>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Artem Bityutskiy <dedekind1@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@sandia.gov>
Cc: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  arch/tile/Kconfig       |  6
-rw-r--r--  block/blk-core.c        |  8
-rw-r--r--  fs/ext3/super.c         |  1
-rw-r--r--  include/uapi/linux/fs.h |  3
-rw-r--r--  mm/Kconfig              | 13
-rw-r--r--  mm/bounce.c             | 48
-rw-r--r--  mm/page-writeback.c     |  4
7 files changed, 70 insertions(+), 13 deletions(-)
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 1bb7ad4aeff4..b1e68f52029c 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -412,12 +412,6 @@ config TILE_USB
412 Provides USB host adapter support for the built-in EHCI and OHCI 412 Provides USB host adapter support for the built-in EHCI and OHCI
413 interfaces on TILE-Gx chips. 413 interfaces on TILE-Gx chips.
414 414
415# USB OHCI needs the bounce pool since tilegx will often have more
416# than 4GB of memory, but we don't currently use the IOTLB to present
417# a 32-bit address to OHCI. So we need to use a bounce pool instead.
418config NEED_BOUNCE_POOL
419 def_bool USB_OHCI_HCD
420
421source "drivers/pci/hotplug/Kconfig" 415source "drivers/pci/hotplug/Kconfig"
422 416
423endmenu 417endmenu
diff --git a/block/blk-core.c b/block/blk-core.c
index c973249d68cd..277134cb5d32 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1474,6 +1474,11 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
1474 */ 1474 */
1475 blk_queue_bounce(q, &bio); 1475 blk_queue_bounce(q, &bio);
1476 1476
1477 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1478 bio_endio(bio, -EIO);
1479 return;
1480 }
1481
1477 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { 1482 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1478 spin_lock_irq(q->queue_lock); 1483 spin_lock_irq(q->queue_lock);
1479 where = ELEVATOR_INSERT_FLUSH; 1484 where = ELEVATOR_INSERT_FLUSH;
@@ -1714,9 +1719,6 @@ generic_make_request_checks(struct bio *bio)
1714 */ 1719 */
1715 blk_partition_remap(bio); 1720 blk_partition_remap(bio);
1716 1721
1717 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
1718 goto end_io;
1719
1720 if (bio_check_eod(bio, nr_sectors)) 1722 if (bio_check_eod(bio, nr_sectors))
1721 goto end_io; 1723 goto end_io;
1722 1724
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6e50223b3299..4ba2683c1d44 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2065,6 +2065,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2065 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": 2065 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
2066 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": 2066 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
2067 "writeback"); 2067 "writeback");
2068 sb->s_flags |= MS_SNAP_STABLE;
2068 2069
2069 return 0; 2070 return 0;
2070 2071
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 780d4c6093eb..c7fc1e6517c3 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -86,6 +86,9 @@ struct inodes_stat_t {
86#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ 86#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
87#define MS_I_VERSION (1<<23) /* Update inode I_version field */ 87#define MS_I_VERSION (1<<23) /* Update inode I_version field */
88#define MS_STRICTATIME (1<<24) /* Always perform atime updates */ 88#define MS_STRICTATIME (1<<24) /* Always perform atime updates */
89
90/* These sb flags are internal to the kernel */
91#define MS_SNAP_STABLE (1<<27) /* Snapshot pages during writeback, if needed */
89#define MS_NOSEC (1<<28) 92#define MS_NOSEC (1<<28)
90#define MS_BORN (1<<29) 93#define MS_BORN (1<<29)
91#define MS_ACTIVE (1<<30) 94#define MS_ACTIVE (1<<30)
diff --git a/mm/Kconfig b/mm/Kconfig
index 278e3ab1f169..7901d839aab2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -258,6 +258,19 @@ config BOUNCE
258 def_bool y 258 def_bool y
259 depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) 259 depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
260 260
261# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often
262# have more than 4GB of memory, but we don't currently use the IOTLB to present
263# a 32-bit address to OHCI. So we need to use a bounce pool instead.
264#
265# We also use the bounce pool to provide stable page writes for jbd. jbd
266# initiates buffer writeback without locking the page or setting PG_writeback,
267# and fixing that behavior (a second time; jbd2 doesn't have this problem) is
268# a major rework effort. Instead, use the bounce buffer to snapshot pages
269# (until jbd goes away). The only jbd user is ext3.
270config NEED_BOUNCE_POOL
271 bool
272 default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD)
273
261config NR_QUICK 274config NR_QUICK
262 int 275 int
263 depends on QUICKLIST 276 depends on QUICKLIST
diff --git a/mm/bounce.c b/mm/bounce.c
index 042086775561..5f8901768602 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -178,8 +178,45 @@ static void bounce_end_io_read_isa(struct bio *bio, int err)
178 __bounce_end_io_read(bio, isa_page_pool, err); 178 __bounce_end_io_read(bio, isa_page_pool, err);
179} 179}
180 180
181#ifdef CONFIG_NEED_BOUNCE_POOL
182static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
183{
184 struct page *page;
185 struct backing_dev_info *bdi;
186 struct address_space *mapping;
187 struct bio_vec *from;
188 int i;
189
190 if (bio_data_dir(bio) != WRITE)
191 return 0;
192
193 if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
194 return 0;
195
196 /*
197 * Based on the first page that has a valid mapping, decide whether or
198 * not we have to employ bounce buffering to guarantee stable pages.
199 */
200 bio_for_each_segment(from, bio, i) {
201 page = from->bv_page;
202 mapping = page_mapping(page);
203 if (!mapping)
204 continue;
205 bdi = mapping->backing_dev_info;
206 return mapping->host->i_sb->s_flags & MS_SNAP_STABLE;
207 }
208
209 return 0;
210}
211#else
212static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
213{
214 return 0;
215}
216#endif /* CONFIG_NEED_BOUNCE_POOL */
217
181static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, 218static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
182 mempool_t *pool) 219 mempool_t *pool, int force)
183{ 220{
184 struct page *page; 221 struct page *page;
185 struct bio *bio = NULL; 222 struct bio *bio = NULL;
@@ -192,7 +229,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
192 /* 229 /*
193 * is destination page below bounce pfn? 230 * is destination page below bounce pfn?
194 */ 231 */
195 if (page_to_pfn(page) <= queue_bounce_pfn(q)) 232 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
196 continue; 233 continue;
197 234
198 /* 235 /*
@@ -270,6 +307,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
270 307
271void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) 308void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
272{ 309{
310 int must_bounce;
273 mempool_t *pool; 311 mempool_t *pool;
274 312
275 /* 313 /*
@@ -278,13 +316,15 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
278 if (!bio_has_data(*bio_orig)) 316 if (!bio_has_data(*bio_orig))
279 return; 317 return;
280 318
319 must_bounce = must_snapshot_stable_pages(q, *bio_orig);
320
281 /* 321 /*
282 * for non-isa bounce case, just check if the bounce pfn is equal 322 * for non-isa bounce case, just check if the bounce pfn is equal
283 * to or bigger than the highest pfn in the system -- in that case, 323 * to or bigger than the highest pfn in the system -- in that case,
284 * don't waste time iterating over bio segments 324 * don't waste time iterating over bio segments
285 */ 325 */
286 if (!(q->bounce_gfp & GFP_DMA)) { 326 if (!(q->bounce_gfp & GFP_DMA)) {
287 if (queue_bounce_pfn(q) >= blk_max_pfn) 327 if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce)
288 return; 328 return;
289 pool = page_pool; 329 pool = page_pool;
290 } else { 330 } else {
@@ -295,7 +335,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
295 /* 335 /*
296 * slow path 336 * slow path
297 */ 337 */
298 __blk_queue_bounce(q, bio_orig, pool); 338 __blk_queue_bounce(q, bio_orig, pool, must_bounce);
299} 339}
300 340
301EXPORT_SYMBOL(blk_queue_bounce); 341EXPORT_SYMBOL(blk_queue_bounce);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 355d5ee69058..7300c9d5e1d9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2306,6 +2306,10 @@ void wait_for_stable_page(struct page *page)
2306 2306
2307 if (!bdi_cap_stable_pages_required(bdi)) 2307 if (!bdi_cap_stable_pages_required(bdi))
2308 return; 2308 return;
2309#ifdef CONFIG_NEED_BOUNCE_POOL
2310 if (mapping->host->i_sb->s_flags & MS_SNAP_STABLE)
2311 return;
2312#endif /* CONFIG_NEED_BOUNCE_POOL */
2309 2313
2310 wait_on_page_writeback(page); 2314 wait_on_page_writeback(page);
2311} 2315}