litmus-rt.git - The LITMUS^RT kernel.

diff options

author	David S. Miller <davem@sunset.davemloft.net>	2007-05-30 22:01:47 -0400
committer	David S. Miller <davem@sunset.davemloft.net>	2007-05-31 04:52:48 -0400
commit	dbbe3cb8cff6b494ac2cba6a94dc7aabe7e5b635 (patch)
tree	38832dcc3083891ef65d794d34343d71b67c7984
parent	3f0a6766e0cc5a577805732e5adb50a585c58175 (diff)

[SPARC64]: Add missing NCS and SVC hypervisor interfaces.

Signed-off-by: David S. Miller <davem@davemloft.net>

Diffstat

-rw-r--r--

arch/sparc64/kernel/entry.S

-rw-r--r--

include/asm-sparc64/hypervisor.h

168

2 files changed, 240 insertions, 0 deletions


diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S index 8f10dda0f5c0..ed712e0b3372 100644 --- a/arch/sparc64/kernel/entry.S +++ b/arch/sparc64/kernel/entry.S
@@ -2498,3 +2498,75 @@ sun4v_vintr_set_target:
2498	retl	2498	retl
2499	nop	2499	nop
2500	.size sun4v_vintr_set_target, .-sun4v_vintr_set_target	2500	.size sun4v_vintr_set_target, .-sun4v_vintr_set_target
		2501
		2502	/* %o0: NCS sub-function
		2503	* %o1: sub-function arg real-address
		2504	* %o2: sub-function arg size
		2505	*
		2506	* returns %o0: status
		2507	*/
		2508	.globl sun4v_ncs_request
		2509	.type sun4v_ncs_request,#function
		2510	sun4v_ncs_request:
		2511	mov HV_FAST_NCS_REQUEST, %o5
		2512	ta HV_FAST_TRAP
		2513	retl
		2514	nop
		2515	.size sun4v_ncs_request, .-sun4v_ncs_request
		2516
		2517	.globl sun4v_scv_send
		2518	.type sun4v_scv_send,#function
		2519	sun4v_scv_send:
		2520	save %sp, -192, %sp
		2521	mov %i0, %o0
		2522	mov %i1, %o1
		2523	mov %i2, %o2
		2524	mov HV_FAST_SVC_SEND, %o5
		2525	ta HV_FAST_TRAP
		2526	stx %o1, [%i3]
		2527	ret
		2528	restore
		2529	.size sun4v_scv_send, .-sun4v_scv_send
		2530
		2531	.globl sun4v_scv_recv
		2532	.type sun4v_scv_recv,#function
		2533	sun4v_scv_recv:
		2534	save %sp, -192, %sp
		2535	mov %i0, %o0
		2536	mov %i1, %o1
		2537	mov %i2, %o2
		2538	mov HV_FAST_SVC_RECV, %o5
		2539	ta HV_FAST_TRAP
		2540	stx %o1, [%i3]
		2541	ret
		2542	restore
		2543	.size sun4v_scv_recv, .-sun4v_scv_recv
		2544
		2545	.globl sun4v_scv_getstatus
		2546	.type sun4v_scv_getstatus,#function
		2547	sun4v_scv_getstatus:
		2548	mov HV_FAST_SVC_GETSTATUS, %o5
		2549	mov %o1, %o4
		2550	ta HV_FAST_TRAP
		2551	stx %o1, [%o4]
		2552	retl
		2553	nop
		2554	.size sun4v_scv_getstatus, .-sun4v_scv_getstatus
		2555
		2556	.globl sun4v_scv_setstatus
		2557	.type sun4v_scv_setstatus,#function
		2558	sun4v_scv_setstatus:
		2559	mov HV_FAST_SVC_SETSTATUS, %o5
		2560	ta HV_FAST_TRAP
		2561	retl
		2562	nop
		2563	.size sun4v_scv_setstatus, .-sun4v_scv_setstatus
		2564
		2565	.globl sun4v_scv_clrstatus
		2566	.type sun4v_scv_clrstatus,#function
		2567	sun4v_scv_clrstatus:
		2568	mov HV_FAST_SVC_CLRSTATUS, %o5
		2569	ta HV_FAST_TRAP
		2570	retl
		2571	nop
		2572	.size sun4v_scv_clrstatus, .-sun4v_scv_clrstatus


diff --git a/include/asm-sparc64/hypervisor.h b/include/asm-sparc64/hypervisor.h index 5cdb1ff04838..4a43075a0619 100644 --- a/include/asm-sparc64/hypervisor.h +++ b/include/asm-sparc64/hypervisor.h
@@ -1097,6 +1097,80 @@ extern unsigned long sun4v_mach_set_soft_state(unsigned long soft_state,
1097	*/	1097	*/
1098	#define HV_FAST_MACH_GET_SOFT_STATE 0x71	1098	#define HV_FAST_MACH_GET_SOFT_STATE 0x71
1099		1099
		1100	/* svc_send()
		1101	* TRAP: HV_FAST_TRAP
		1102	* FUNCTION: HV_FAST_SVC_SEND
		1103	* ARG0: service ID
		1104	* ARG1: buffer real address
		1105	* ARG2: buffer size
		1106	* RET0: STATUS
		1107	* RET1: sent_bytes
		1108	*
		1109	* Be careful, all output registers are clobbered by this operation,
		1110	* so for example it is not possible to save away a value in %o4
		1111	* across the trap.
		1112	*/
		1113	#define HV_FAST_SVC_SEND 0x80
		1114
		1115	/* svc_recv()
		1116	* TRAP: HV_FAST_TRAP
		1117	* FUNCTION: HV_FAST_SVC_RECV
		1118	* ARG0: service ID
		1119	* ARG1: buffer real address
		1120	* ARG2: buffer size
		1121	* RET0: STATUS
		1122	* RET1: recv_bytes
		1123	*
		1124	* Be careful, all output registers are clobbered by this operation,
		1125	* so for example it is not possible to save away a value in %o4
		1126	* across the trap.
		1127	*/
		1128	#define HV_FAST_SVC_RECV 0x81
		1129
		1130	/* svc_getstatus()
		1131	* TRAP: HV_FAST_TRAP
		1132	* FUNCTION: HV_FAST_SVC_GETSTATUS
		1133	* ARG0: service ID
		1134	* RET0: STATUS
		1135	* RET1: status bits
		1136	*/
		1137	#define HV_FAST_SVC_GETSTATUS 0x82
		1138
		1139	/* svc_setstatus()
		1140	* TRAP: HV_FAST_TRAP
		1141	* FUNCTION: HV_FAST_SVC_SETSTATUS
		1142	* ARG0: service ID
		1143	* ARG1: bits to set
		1144	* RET0: STATUS
		1145	*/
		1146	#define HV_FAST_SVC_SETSTATUS 0x83
		1147
		1148	/* svc_clrstatus()
		1149	* TRAP: HV_FAST_TRAP
		1150	* FUNCTION: HV_FAST_SVC_CLRSTATUS
		1151	* ARG0: service ID
		1152	* ARG1: bits to clear
		1153	* RET0: STATUS
		1154	*/
		1155	#define HV_FAST_SVC_CLRSTATUS 0x84
		1156
		1157	#ifndef __ASSEMBLY__
		1158	extern unsigned long sun4v_svc_send(unsigned long svc_id,
		1159	unsigned long buffer,
		1160	unsigned long buffer_size,
		1161	unsigned long *sent_bytes);
		1162	extern unsigned long sun4v_svc_recv(unsigned long svc_id,
		1163	unsigned long buffer,
		1164	unsigned long buffer_size,
		1165	unsigned long *recv_bytes);
		1166	extern unsigned long sun4v_svc_getstatus(unsigned long svc_id,
		1167	unsigned long *status_bits);
		1168	extern unsigned long sun4v_svc_setstatus(unsigned long svc_id,
		1169	unsigned long status_bits);
		1170	extern unsigned long sun4v_svc_clrstatus(unsigned long svc_id,
		1171	unsigned long status_bits);
		1172	#endif
		1173
1100	/* Trap trace services.	1174	/* Trap trace services.
1101	*	1175	*
1102	* The hypervisor provides a trap tracing capability for privileged	1176	* The hypervisor provides a trap tracing capability for privileged
@@ -2724,6 +2798,100 @@ struct hv_mmu_statistics {
2724	*/	2798	*/
2725	#define HV_FAST_MMUSTAT_INFO 0x103	2799	#define HV_FAST_MMUSTAT_INFO 0x103
2726		2800
		2801	/* NCS crypto services */
		2802
		2803	/* ncs_request() sub-function numbers */
		2804	#define HV_NCS_QCONF 0x01
		2805	#define HV_NCS_QTAIL_UPDATE 0x02
		2806
		2807	#ifndef __ASSEMBLY__
		2808	struct hv_ncs_queue_entry {
		2809	/* MAU Control Register */
		2810	unsigned long mau_control;
		2811	#define MAU_CONTROL_INV_PARITY 0x0000000000002000
		2812	#define MAU_CONTROL_STRAND 0x0000000000001800
		2813	#define MAU_CONTROL_BUSY 0x0000000000000400
		2814	#define MAU_CONTROL_INT 0x0000000000000200
		2815	#define MAU_CONTROL_OP 0x00000000000001c0
		2816	#define MAU_CONTROL_OP_SHIFT 6
		2817	#define MAU_OP_LOAD_MA_MEMORY 0x0
		2818	#define MAU_OP_STORE_MA_MEMORY 0x1
		2819	#define MAU_OP_MODULAR_MULT 0x2
		2820	#define MAU_OP_MODULAR_REDUCE 0x3
		2821	#define MAU_OP_MODULAR_EXP_LOOP 0x4
		2822	#define MAU_CONTROL_LEN 0x000000000000003f
		2823	#define MAU_CONTROL_LEN_SHIFT 0
		2824
		2825	/* Real address of bytes to load or store bytes
		2826	* into/out-of the MAU.
		2827	*/
		2828	unsigned long mau_mpa;
		2829
		2830	/* Modular Arithmetic MA Offset Register. */
		2831	unsigned long mau_ma;
		2832
		2833	/* Modular Arithmetic N Prime Register. */
		2834	unsigned long mau_np;
		2835	};
		2836
		2837	struct hv_ncs_qconf_arg {
		2838	unsigned long mid; /* MAU ID, 1 per core on Niagara */
		2839	unsigned long base; /* Real address base of queue */
		2840	unsigned long end; /* Real address end of queue */
		2841	unsigned long num_ents; /* Number of entries in queue */
		2842	};
		2843
		2844	struct hv_ncs_qtail_update_arg {
		2845	unsigned long mid; /* MAU ID, 1 per core on Niagara */
		2846	unsigned long tail; /* New tail index to use */
		2847	unsigned long syncflag; /* only SYNCFLAG_SYNC is implemented */
		2848	#define HV_NCS_SYNCFLAG_SYNC 0x00
		2849	#define HV_NCS_SYNCFLAG_ASYNC 0x01
		2850	};
		2851	#endif
		2852
		2853	/* ncs_request()
		2854	* TRAP: HV_FAST_TRAP
		2855	* FUNCTION: HV_FAST_NCS_REQUEST
		2856	* ARG0: NCS sub-function
		2857	* ARG1: sub-function argument real address
		2858	* ARG2: size in bytes of sub-function argument
		2859	* RET0: status
		2860	*
		2861	* The MAU chip of the Niagara processor is not directly accessible
		2862	* to privileged code, instead it is programmed indirectly via this
		2863	* hypervisor API.
		2864	*
		2865	* The interfaces defines a queue of MAU operations to perform.
		2866	* Privileged code registers a queue with the hypervisor by invoking
		2867	* this HVAPI with the HV_NCS_QCONF sub-function, which defines the
		2868	* base, end, and number of entries of the queue. Each queue entry
		2869	* contains a MAU register struct block.
		2870	*
		2871	* The privileged code then proceeds to add entries to the queue and
		2872	* then invoke the HV_NCS_QTAIL_UPDATE sub-function. Since only
		2873	* synchronous operations are supported by the current hypervisor,
		2874	* HV_NCS_QTAIL_UPDATE will run all the pending queue entries to
		2875	* completion and return HV_EOK, or return an error code.
		2876	*
		2877	* The real address of the sub-function argument must be aligned on at
		2878	* least an 8-byte boundary.
		2879	*
		2880	* The tail argument of HV_NCS_QTAIL_UPDATE is an index, not a byte
		2881	* offset, into the queue and must be less than or equal the 'num_ents'
		2882	* argument given in the HV_NCS_QCONF call.
		2883	*/
		2884	#define HV_FAST_NCS_REQUEST 0x110
		2885
		2886	#ifndef __ASSEMBLY__
		2887	extern unsigned long sun4v_ncs_request(unsigned long request,
		2888	unsigned long arg_ra,
		2889	unsigned long arg_size);
		2890	#endif
		2891
		2892	#define HV_FAST_FIRE_GET_PERFREG 0x120
		2893	#define HV_FAST_FIRE_SET_PERFREG 0x121
		2894
2727	/* Function numbers for HV_CORE_TRAP. */	2895	/* Function numbers for HV_CORE_TRAP. */
2728	#define HV_CORE_SET_VER 0x00	2896	#define HV_CORE_SET_VER 0x00
2729	#define HV_CORE_PUTCHAR 0x01	2897	#define HV_CORE_PUTCHAR 0x01

/* * linux/fs/buffer.c * * Copyright (C) 1991, 1992, 2002 Linus Torvalds */ /* * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 * * Removed a lot of unnecessary code and simplified things now that * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 * * Speed up hash, lru, and free list operations. Use gfp() for allocating * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM * * Added 32k buffer block sizes - these are required older ARM systems. - RMK * * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */ #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/smp_lock.h> #include <linux/capability.h> #include <linux/blkdev.h> #include <linux/file.h> #include <linux/quotaops.h> #include <linux/highmem.h> #include <linux/module.h> #include <linux/writeback.h> #include <linux/hash.h> #include <linux/suspend.h> #include <linux/buffer_head.h> #include <linux/task_io_accounting_ops.h> #include <linux/bio.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/bitops.h> #include <linux/mpage.h> #include <linux/bit_spinlock.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static void invalidate_bh_lrus(void); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) inline void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) { bh->b_end_io = handler; bh->b_private = private; } static int sync_buffer(void *word) { struct block_device *bd; struct buffer_head *bh = container_of(word, struct buffer_head, b_state); smp_mb(); bd = bh->b_bdev; if (bd) blk_run_address_space(bd->bd_inode->i_mapping); io_schedule(); return 0; } void fastcall __lock_buffer(struct buffer_head *bh) { wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_buffer); void fastcall unlock_buffer(struct buffer_head *bh) { smp_mb__before_clear_bit(); clear_buffer_locked(bh); smp_mb__after_clear_bit(); wake_up_bit(&bh->b_state, BH_Lock); } /* * Block until a buffer comes unlocked. This doesn't stop it * from becoming locked again - you have to lock it yourself * if you want to preserve its state. */ void __wait_on_buffer(struct buffer_head * bh) { wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); } static void __clear_page_buffers(struct page *page) { ClearPagePrivate(page); set_page_private(page, 0); page_cache_release(page); } static void buffer_io_error(struct buffer_head *bh) { char b[BDEVNAME_SIZE]; printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } /* * Default synchronous end-of-IO handler.. Just mark it up-to-date and * unlock the buffer. This is what ll_rw_block uses too. */ void end_buffer_read_sync(struct buffer_head *bh, int uptodate) { if (uptodate) { set_buffer_uptodate(bh); } else { /* This happens, due to failed READA attempts. */ clear_buffer_uptodate(bh); } unlock_buffer(bh); put_bh(bh); } void end_buffer_write_sync(struct buffer_head *bh, int uptodate) { char b[BDEVNAME_SIZE]; if (uptodate) { set_buffer_uptodate(bh); } else { if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { buffer_io_error(bh); printk(KERN_WARNING "lost page write due to " "I/O error on %s\n", bdevname(bh->b_bdev, b)); } set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); } unlock_buffer(bh); put_bh(bh); } /* * Write out and wait upon all the dirty data associated with a block * device via its mapping. Does not take the superblock lock. */ int sync_blockdev(struct block_device *bdev) { int ret = 0; if (bdev) ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); return ret; } EXPORT_SYMBOL(sync_blockdev); /* * Write out and wait upon all dirty data associated with this * device. Filesystem data as well as the underlying block * device. Takes the superblock lock. */ int fsync_bdev(struct block_device *bdev) { struct super_block *sb = get_super(bdev); if (sb) { int res = fsync_super(sb); drop_super(sb); return res; } return sync_blockdev(bdev); } /** * freeze_bdev -- lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * * This takes the block device bd_mount_sem to make sure no new mounts * happen on bdev until thaw_bdev() is called. * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. */ struct super_block *freeze_bdev(struct block_device *bdev) { struct super_block *sb; down(&bdev->bd_mount_sem); sb = get_super(bdev); if (sb && !(sb->s_flags & MS_RDONLY)) { sb->s_frozen = SB_FREEZE_WRITE; smp_wmb(); __fsync_super(sb); sb->s_frozen = SB_FREEZE_TRANS; smp_wmb(); sync_blockdev(sb->s_bdev); if (sb->s_op->write_super_lockfs) sb->s_op->write_super_lockfs(sb); } sync_blockdev(bdev); return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ } EXPORT_SYMBOL(freeze_bdev); /** * thaw_bdev -- unlock filesystem * @bdev: blockdevice to unlock * @sb: associated superblock * * Unlocks the filesystem and marks it writeable again after freeze_bdev(). */ void thaw_bdev(struct block_device *bdev, struct super_block *sb) { if (sb) { BUG_ON(sb->s_bdev != bdev); if (sb->s_op->unlockfs) sb->s_op->unlockfs(sb); sb->s_frozen = SB_UNFROZEN; smp_wmb(); wake_up(&sb->s_wait_unfrozen); drop_super(sb); } up(&bdev->bd_mount_sem); } EXPORT_SYMBOL(thaw_bdev); /* * Various filesystems appear to want __find_get_block to be non-blocking. * But it's the page lock which protects the buffers. To get around this, * we get exclusion from try_to_free_buffers with the blockdev mapping's * private_lock. * * Hack idea: for the blockdev mapping, i_bufferlist_lock contention * may be quite high. This code could TryLock the page, and if that * succeeds, there is no need to take private_lock. (But if * private_lock is contended then so is mapping->tree_lock). */ static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block) { struct inode *bd_inode = bdev->bd_inode; struct address_space *bd_mapping = bd_inode->i_mapping; struct buffer_head *ret = NULL; pgoff_t index; struct buffer_head *bh; struct buffer_head *head; struct page *page; int all_mapped = 1; index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); page = find_get_page(bd_mapping, index); if (!page) goto out; spin_lock(&bd_mapping->private_lock); if (!page_has_buffers(page)) goto out_unlock; head = page_buffers(page); bh = head; do { if (bh->b_blocknr == block) { ret = bh; get_bh(bh); goto out_unlock; } if (!buffer_mapped(bh)) all_mapped = 0; bh = bh->b_this_page; } while (bh != head); /* we might be here because some of the buffers on this page are * not mapped. This is due to various races between * file io on the block device and getblk. It gets dealt with * elsewhere, don't buffer_error if we had some unmapped buffers */ if (all_mapped) { printk("__find_get_block_slow() failed. " "block=%llu, b_blocknr=%llu\n", (unsigned long long)block, (unsigned long long)bh->b_blocknr); printk("b_state=0x%08lx, b_size=%zu\n", bh->b_state, bh->b_size); printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); } out_unlock: spin_unlock(&bd_mapping->private_lock); page_cache_release(page); out: return ret; } /* If invalidate_buffers() will trash dirty buffers, it means some kind of fs corruption is going on. Trashing dirty data always imply losing information that was supposed to be just stored on the physical layer by the user. Thus invalidate_buffers in general usage is not allwowed to trash dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved. These buffers are simply skipped. We also skip buffers which are still in use. For example this can happen if a userspace program is reading the block device. NOTE: In the case where the user removed a removable-media-disk even if there's still dirty data not synced on disk (due a bug in the device driver or due an error of the user), by not destroying the dirty buffers we could generate corruption also on the next media inserted, thus a parameter is necessary to handle this case in the most safe way possible (trying to not corrupt also the new disk inserted with the data belonging to the old now corrupted disk). Also for the ramdisk the natural thing to do in order to release the ramdisk memory is to destroy dirty buffers. These are two special cases. Normal usage imply the device driver to issue a sync on the device (without waiting I/O completion) and then an invalidate_buffers call that doesn't trash dirty buffers. For handling cache coherency with the blkdev pagecache the 'update' case is been introduced. It is needed to re-read from disk any pinned buffer. NOTE: re-reading from disk is destructive so we can do it only when we assume nobody is changing the buffercache under our I/O and when we think the disk contains more recent information than the buffercache. The update == 1 pass marks the buffers we need to update, the update == 2 pass does the actual I/O. */ void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers) { struct address_space *mapping = bdev->bd_inode->i_mapping; if (mapping->nrpages == 0) return; invalidate_bh_lrus(); /* * FIXME: what about destroy_dirty_buffers? * We really want to use invalidate_inode_pages2() for * that, but not until that's cleaned up. */ invalidate_mapping_pages(mapping, 0, -1); } /* * Kick pdflush then try to free up some ZONE_NORMAL memory. */ static void free_more_memory(void) { struct zone **zones; pg_data_t *pgdat; wakeup_pdflush(1024); yield(); for_each_online_pgdat(pgdat) { zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; if (*zones) try_to_free_pages(zones, GFP_NOFS); } } /* * I/O completion handler for block_read_full_page() - pages * which come unlocked at the end of I/O. */ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) { unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; struct page *page; int page_uptodate = 1; BUG_ON(!buffer_async_read(bh)); page = bh->b_page; if (uptodate) { set_buffer_uptodate(bh); } else { clear_buffer_uptodate(bh); if (printk_ratelimit()) buffer_io_error(bh); SetPageError(page); } /* * Be _very_ careful from here on. Bad things can happen if * two buffer heads end IO at almost the same time and both * decide that the page is now completely done. */ first = page_buffers(page); local_irq_save(flags); bit_spin_lock(BH_Uptodate_Lock, &first->b_state); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; do { if (!buffer_uptodate(tmp)) page_uptodate = 0; if (buffer_async_read(tmp)) { BUG_ON(!buffer_locked(tmp)); goto still_busy; } tmp = tmp->b_this_page; } while (tmp != bh); bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); local_irq_restore(flags); /* * If none of the buffers had errors and they are all * uptodate then we can set the page uptodate. */ if (page_uptodate && !PageError(page)) SetPageUptodate(page); unlock_page(page); return; still_busy: bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); local_irq_restore(flags); return; } /* * Completion handler for block_write_full_page() - pages which are unlocked * during I/O, and which have PageWriteback cleared upon I/O completion. */ static void end_buffer_async_write(struct buffer_head *bh, int uptodate) { char b[BDEVNAME_SIZE]; unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; struct page *page; BUG_ON(!buffer_async_write(bh)); page = bh->b_page; if (uptodate) { set_buffer_uptodate(bh); } else { if (printk_ratelimit()) { buffer_io_error(bh); printk(KERN_WARNING "lost page write due to " "I/O error on %s\n", bdevname(bh->b_bdev, b)); } set_bit(AS_EIO, &page->mapping->flags); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); SetPageError(page); } first = page_buffers(page); local_irq_save(flags); bit_spin_lock(BH_Uptodate_Lock, &first->b_state); clear_buffer_async_write(bh); unlock_buffer(bh); tmp = bh->b_this_page; while (tmp != bh) { if (buffer_async_write(tmp)) { BUG_ON(!buffer_locked(tmp)); goto still_busy; } tmp = tmp->b_this_page; } bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); local_irq_restore(flags); end_page_writeback(page); return; still_busy: bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); local_irq_restore(flags); return; } /* * If a page's buffers are under async readin (end_buffer_async_read * completion) then there is a possibility that another thread of * control could lock one of the buffers after it has completed * but while some of the other buffers have not completed. This * locked buffer would confuse end_buffer_async_read() into not unlocking * the page. So the absence of BH_Async_Read tells end_buffer_async_read() * that this buffer is not under async I/O. * * The page comes unlocked when it has no locked buffer_async buffers * left. * * PageLocked prevents anyone starting new async I/O reads any of * the buffers. * * PageWriteback is used to prevent simultaneous writeout of the same * page. * * PageLocked prevents anyone from starting writeback of a page which is * under read I/O (PageWriteback is only ever set against a locked page). */ static void mark_buffer_async_read(struct buffer_head *bh) { bh->b_end_io = end_buffer_async_read; set_buffer_async_read(bh); } void mark_buffer_async_write(struct buffer_head *bh) { bh->b_end_io = end_buffer_async_write; set_buffer_async_write(bh); } EXPORT_SYMBOL(mark_buffer_async_write); /* * fs/buffer.c contains helper functions for buffer-backed address space's * fsync functions. A common requirement for buffer-based filesystems is * that certain data from the backing blockdev needs to be written out for * a successful fsync(). For example, ext2 indirect blocks need to be * written back and waited upon before fsync() returns. * * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), * inode_has_buffers() and invalidate_inode_buffers() are provided for the * management of a list of dependent buffers at ->i_mapping->private_list. * * Locking is a little subtle: try_to_free_buffers() will remove buffers * from their controlling inode's queue when they are being freed. But * try_to_free_buffers() will be operating against the *blockdev* mapping * at the time, not against the S_ISREG file which depends on those buffers. * So the locking for private_list is via the private_lock in the address_space * which backs the buffers. Which is different from the address_space * against which the buffers are listed. So for a particular address_space, * mapping->private_lock does *not* protect mapping->private_list! In fact, * mapping->private_list will always be protected by the backing blockdev's * ->private_lock. * * Which introduces a requirement: all buffers on an address_space's * ->private_list must be from the same address_space: the blockdev's. * * address_spaces which do not place buffers at ->private_list via these * utility functions are free to use private_lock and private_list for * whatever they want. The only requirement is that list_empty(private_list) * be true at clear_inode() time. * * FIXME: clear_inode should not call invalidate_inode_buffers(). The * filesystems should do that. invalidate_inode_buffers() should just go * BUG_ON(!list_empty). * * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should * take an address_space, not an inode. And it should be called * mark_buffer_dirty_fsync() to clearly define why those buffers are being * queued up. * * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the * list if it is already on a list. Because if the buffer is on a list, * it *must* already be on the right one. If not, the filesystem is being * silly. This will save a ton of locking. But first we have to ensure * that buffers are taken *off* the old inode's list when they are freed * (presumably in truncate). That requires careful auditing of all * filesystems (do it inside bforget()). It could also be done by bringing * b_inode back. */ /* * The buffer's backing address_space's private_lock must be held */ static inline void __remove_assoc_queue(struct buffer_head *bh) { list_del_init(&bh->b_assoc_buffers); WARN_ON(!bh->b_assoc_map); if (buffer_write_io_error(bh)) set_bit(AS_EIO, &bh->b_assoc_map->flags); bh->b_assoc_map = NULL; } int inode_has_buffers(struct inode *inode) { return !list_empty(&inode->i_data.private_list); } /* * osync is designed to support O_SYNC io. It waits synchronously for * all already-submitted IO to complete, but does not queue any new * writes to the disk. * * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as * you dirty the buffers, and then use osync_inode_buffers to wait for * completion. Any other dirty buffers which are not yet queued for * write will not be flushed to disk by the osync. */ static int osync_buffers_list(spinlock_t *lock, struct list_head *list) { struct buffer_head *bh; struct list_head *p; int err = 0; spin_lock(lock); repeat: list_for_each_prev(p, list) { bh = BH_ENTRY(p); if (buffer_locked(bh)) { get_bh(bh); spin_unlock(lock); wait_on_buffer(bh); if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); spin_lock(lock); goto repeat; } } spin_unlock(lock); return err; } /** * sync_mapping_buffers - write out and wait upon a mapping's "associated" * buffers * @mapping: the mapping which wants those buffers written * * Starts I/O against the buffers at mapping->private_list, and waits upon * that I/O. * * Basically, this is a convenience function for fsync(). * @mapping is a file or directory which needs those buffers to be written for * a successful fsync(). */ int sync_mapping_buffers(struct address_space *mapping) { struct address_space *buffer_mapping = mapping->assoc_mapping; if (buffer_mapping == NULL || list_empty(&mapping->private_list)) return 0; return fsync_buffers_list(&buffer_mapping->private_lock, &mapping->private_list); } EXPORT_SYMBOL(sync_mapping_buffers); /* * Called when we've recently written block `bblock', and it is known that * `bblock' was for a buffer_boundary() buffer. This means that the block at * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's * dirty, schedule it for IO. So that indirects merge nicely with their data. */ void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize) { struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); if (bh) { if (buffer_dirty(bh)) ll_rw_block(WRITE, 1, &bh); put_bh(bh); } } void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) { struct address_space *mapping = inode->i_mapping; struct address_space *buffer_mapping = bh->b_page->mapping; mark_buffer_dirty(bh); if (!mapping->assoc_mapping) { mapping->assoc_mapping = buffer_mapping; } else { BUG_ON(mapping->assoc_mapping != buffer_mapping); } if (list_empty(&bh->b_assoc_buffers)) { spin_lock(&buffer_mapping->private_lock); list_move_tail(&bh->b_assoc_buffers, &mapping->private_list); bh->b_assoc_map = mapping; spin_unlock(&buffer_mapping->private_lock); } } EXPORT_SYMBOL(mark_buffer_dirty_inode); /* * Add a page to the dirty page list. * * It is a sad fact of life that this function is called from several places * deeply under spinlocking. It may not sleep. * * If the page has buffers, the uptodate buffers are set dirty, to preserve * dirty-state coherency between the page and the buffers. It the page does * not have buffers then when they are later attached they will all be set * dirty. * * The buffers are dirtied before the page is dirtied. There's a small race * window in which a writepage caller may see the page cleanness but not the * buffer dirtiness. That's fine. If this code were to set the page dirty * before the buffers, a concurrent writepage caller could clear the page dirty * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean * page on the dirty page list. * * We use private_lock to lock against try_to_free_buffers while using the * page's buffer list. Also use this to protect against clean buffers being * added to the page after it was set dirty. * * FIXME: may need to call ->reservepage here as well. That's rather up to the * address_space though. */ int __set_page_dirty_buffers(struct page *page) { struct address_space * const mapping = page_mapping(page); if (unlikely(!mapping)) return !TestSetPageDirty(page); spin_lock(&mapping->private_lock); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; do { set_buffer_dirty(bh); bh = bh->b_this_page; } while (bh != head); } spin_unlock(&mapping->private_lock); if (TestSetPageDirty(page)) return 0; write_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); task_io_account_write(PAGE_CACHE_SIZE); } radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } write_unlock_irq(&mapping->tree_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return 1; } EXPORT_SYMBOL(__set_page_dirty_buffers); /* * Write out and wait upon a list of buffers. * * We have conflicting pressures: we want to make sure that all * initially dirty buffers get waited on, but that any subsequently * dirtied buffers don't. After all, we don't want fsync to last * forever if somebody is actively writing to the file. * * Do this in two main stages: first we copy dirty buffers to a * temporary inode list, queueing the writes as we go. Then we clean * up, waiting for those writes to complete. * * During this second stage, any subsequent updates to the file may end * up refiling the buffer on the original inode's dirty list again, so * there is a chance we will end up with a buffer queued for write but * not yet completed on that list. So, as a final cleanup we go through * the osync code to catch these locked, dirty buffers without requeuing * any newly dirty buffers for write. */ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) { struct buffer_head *bh; struct list_head tmp; int err = 0, err2; INIT_LIST_HEAD(&tmp); spin_lock(lock); while (!list_empty(list)) { bh = BH_ENTRY(list->next); __remove_assoc_queue(bh); if (buffer_dirty(bh) || buffer_locked(bh)) { list_add(&bh->b_assoc_buffers, &tmp); if (buffer_dirty(bh)) { get_bh(bh); spin_unlock(lock); /* * Ensure any pending I/O completes so that * ll_rw_block() actually writes the current * contents - it is a noop if I/O is still in * flight on potentially older contents. */ ll_rw_block(SWRITE, 1, &bh); brelse(bh); spin_lock(lock); } } } while (!list_empty(&tmp)) { bh = BH_ENTRY(tmp.prev); list_del_init(&bh->b_assoc_buffers); get_bh(bh); spin_unlock(lock); wait_on_buffer(bh); if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); spin_lock(lock); } spin_unlock(lock); err2 = osync_buffers_list(lock, list); if (err) return err; else return err2; } /* * Invalidate any and all dirty buffers on a given inode. We are * probably unmounting the fs, but that doesn't mean we have already * done a sync(). Just drop the buffers from the inode list. * * NOTE: we take the inode's blockdev's mapping's private_lock. Which * assumes that all the buffers are against the blockdev. Not true * for reiserfs. */ void invalidate_inode_buffers(struct inode *inode) { if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; struct list_head *list = &mapping->private_list; struct address_space *buffer_mapping = mapping->assoc_mapping; spin_lock(&buffer_mapping->private_lock); while (!list_empty(list)) __remove_assoc_queue(BH_ENTRY(list->next)); spin_unlock(&buffer_mapping->private_lock); } } /* * Remove any clean buffers from the inode's buffer list. This is called * when we're trying to free the inode itself. Those buffers can pin it. * * Returns true if all buffers were removed. */ int remove_inode_buffers(struct inode *inode) { int ret = 1; if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; struct list_head *list = &mapping->private_list; struct address_space *buffer_mapping = mapping->assoc_mapping; spin_lock(&buffer_mapping->private_lock); while (!list_empty(list)) { struct buffer_head *bh = BH_ENTRY(list->next); if (buffer_dirty(bh)) { ret = 0; break; } __remove_assoc_queue(bh); } spin_unlock(&buffer_mapping->private_lock); } return ret; } /* * Create the appropriate buffers when given a page for data area and * the size of each buffer.. Use the bh->b_this_page linked list to * follow the buffers created. Return NULL if unable to create more * buffers. * * The retry flag is used to differentiate async IO (paging, swapping) * which may not fail from ordinary buffer allocations. */ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, int retry) { struct buffer_head *bh, *head; long offset; try_again: head = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { bh = alloc_buffer_head(GFP_NOFS); if (!bh) goto no_grow; bh->b_bdev = NULL; bh->b_this_page = head; bh->b_blocknr = -1; head = bh; bh->b_state = 0; atomic_set(&bh->b_count, 0); bh->b_private = NULL; bh->b_size = size; /* Link the buffer to its page */ set_bh_page(bh, page, offset); init_buffer(bh, NULL, NULL); } return head; /* * In case anything failed, we just free everything we got. */ no_grow: if (head) { do { bh = head; head = head->b_this_page; free_buffer_head(bh); } while (head); } /* * Return failure for non-async IO requests. Async IO requests * are not allowed to fail, so we have to wait until buffer heads * become available. But we don't want tasks sleeping with * partially complete buffers, so all were released above. */ if (!retry) return NULL; /* We're _really_ low on memory. Now we just * wait for old buffer heads to become free due to * finishing IO. Since this is an async request and * the reserve list is empty, we're sure there are * async buffer heads in use. */ free_more_memory(); goto try_again; } EXPORT_SYMBOL_GPL(alloc_page_buffers); static inline void link_dev_buffers(struct page *page, struct buffer_head *head) { struct buffer_head *bh, *tail; bh = head; do { tail = bh; bh = bh->b_this_page; } while (bh); tail->b_this_page = head; attach_page_buffers(page, head); } /* * Initialise the state of a blockdev page's buffers. */ static void init_page_buffers(struct page *page, struct block_device *bdev, sector_t block, int size) { struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; int uptodate = PageUptodate(page); do { if (!buffer_mapped(bh)) { init_buffer(bh, NULL, NULL); bh->b_bdev = bdev; bh->b_blocknr = block; if (uptodate) set_buffer_uptodate(bh); set_buffer_mapped(bh); } block++; bh = bh->b_this_page; } while (bh != head); } /* * Create the page-cache page that contains the requested block. * * This is user purely for blockdev mappings. */ static struct page * grow_dev_page(struct block_device *bdev, sector_t block, pgoff_t index, int size) { struct inode *inode = bdev->bd_inode; struct page *page; struct buffer_head *bh; page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) return NULL; BUG_ON(!PageLocked(page)); if (page_has_buffers(page)) { bh = page_buffers(page); if (bh->b_size == size) { init_page_buffers(page, bdev, block, size); return page; } if (!try_to_free_buffers(page)) goto failed; } /* * Allocate some buffers for this page */ bh = alloc_page_buffers(page, size, 0); if (!bh) goto failed; /* * Link the page to the buffers and initialise them. Take the * lock to be atomic wrt __find_get_block(), which does not * run under the page lock. */ spin_lock(&inode->i_mapping->private_lock); link_dev_buffers(page, bh); init_page_buffers(page, bdev, block, size); spin_unlock(&inode->i_mapping->private_lock); return page; failed: BUG(); unlock_page(page); page_cache_release(page); return NULL; } /* * Create buffers for the specified block device block's page. If * that page was dirty, the buffers are set dirty also. * * Except that's a bug. Attaching dirty buffers to a dirty * blockdev's page can result in filesystem corruption, because * some of those buffers may be aliases of filesystem data. * grow_dev_page() will go BUG() if this happens. */ static int grow_buffers(struct block_device *bdev, sector_t block, int size) { struct page *page; pgoff_t index; int sizebits; sizebits = -1; do { sizebits++; } while ((size << sizebits) < PAGE_SIZE); index = block >> sizebits; /* * Check for a block which wants to lie outside our maximum possible * pagecache index. (this comparison is done using sector_t types). */ if (unlikely(index != block >> sizebits)) { char b[BDEVNAME_SIZE]; printk(KERN_ERR "%s: requested out-of-range block %llu for " "device %s\n", __FUNCTION__, (unsigned long long)block, bdevname(bdev, b)); return -EIO; } block = index << sizebits; /* Create a page with the proper size buffers.. */ page = grow_dev_page(bdev, block, index, size); if (!page) return 0; unlock_page(page); page_cache_release(page); return 1; } static struct buffer_head * __getblk_slow(struct block_device *bdev, sector_t block, int size) { /* Size must be multiple of hard sectorsize */ if (unlikely(size & (bdev_hardsect_size(bdev)-1) || (size < 512 || size > PAGE_SIZE))) { printk(KERN_ERR "getblk(): invalid block size %d requested\n", size); printk(KERN_ERR "hardsect size: %d\n", bdev_hardsect_size(bdev)); dump_stack(); return NULL; } for (;;) { struct buffer_head * bh; int ret; bh = __find_get_block(bdev, block, size); if (bh) return bh; ret = grow_buffers(bdev, block, size); if (ret < 0) return NULL; if (ret == 0) free_more_memory(); } } /* * The relationship between dirty buffers and dirty pages: * * Whenever a page has any dirty buffers, the page's dirty bit is set, and * the page is tagged dirty in its radix tree. * * At all times, the dirtiness of the buffers represents the dirtiness of * subsections of the page. If the page has buffers, the page dirty bit is * merely a hint about the true dirty state. * * When a page is set dirty in its entirety, all its buffers are marked dirty * (if the page has buffers). * * When a buffer is marked dirty, its page is dirtied, but the page's other * buffers are not. * * Also. When blockdev buffers are explicitly read with bread(), they * individually become uptodate. But their backing page remains not * uptodate - even if all of its buffers are uptodate. A subsequent * block_read_full_page() against that page will discover all the uptodate * buffers, will set the page uptodate and will perform no I/O. */ /** * mark_buffer_dirty - mark a buffer_head as needing writeout * @bh: the buffer_head to mark dirty * * mark_buffer_dirty() will set the dirty bit against the buffer, then set its * backing page dirty, then tag the page as dirty in its address_space's radix * tree and then attach the address_space's inode to its superblock's dirty * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, * mapping->tree_lock and the global inode_lock. */ void fastcall mark_buffer_dirty(struct buffer_head *bh) { if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) __set_page_dirty_nobuffers(bh->b_page); } /* * Decrement a buffer_head's reference count. If all buffers against a page * have zero reference count, are clean and unlocked, and if the page is clean * and unlocked then try_to_free_buffers() may strip the buffers from the page * in preparation for freeing it (sometimes, rarely, buffers are removed from * a page but it ends up not being freed, and buffers may later be reattached). */ void __brelse(struct buffer_head * buf) { if (atomic_read(&buf->b_count)) { put_bh(buf); return; } printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); WARN_ON(1); } /* * bforget() is like brelse(), except it discards any * potentially dirty data. */ void __bforget(struct buffer_head *bh) { clear_buffer_dirty(bh); if (!list_empty(&bh->b_assoc_buffers)) { struct address_space *buffer_mapping = bh->b_page->mapping; spin_lock(&buffer_mapping->private_lock); list_del_init(&bh->b_assoc_buffers); bh->b_assoc_map = NULL; spin_unlock(&buffer_mapping->private_lock); } __brelse(bh); } static struct buffer_head *__bread_slow(struct buffer_head *bh) { lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); return bh; } else { get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(READ, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; } brelse(bh); return NULL; } /* * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their * refcount elevated by one when they're in an LRU. A buffer can only appear * once in a particular CPU's LRU. A single buffer can be present in multiple * CPU's LRUs at the same time. * * This is a transparent caching front-end to sb_bread(), sb_getblk() and * sb_find_get_block(). * * The LRUs themselves only need locking against invalidate_bh_lrus. We use * a local interrupt disable for that. */ #define BH_LRU_SIZE 8 struct bh_lru { struct buffer_head *bhs[BH_LRU_SIZE]; }; static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; #ifdef CONFIG_SMP #define bh_lru_lock() local_irq_disable() #define bh_lru_unlock() local_irq_enable() #else #define bh_lru_lock() preempt_disable() #define bh_lru_unlock() preempt_enable() #endif static inline void check_irqs_on(void) { #ifdef irqs_disabled BUG_ON(irqs_disabled()); #endif } /* * The LRU management algorithm is dopey-but-simple. Sorry. */ static void bh_lru_install(struct buffer_head *bh) { struct buffer_head *evictee = NULL; struct bh_lru *lru; check_irqs_on(); bh_lru_lock(); lru = &__get_cpu_var(bh_lrus); if (lru->bhs[0] != bh) { struct buffer_head *bhs[BH_LRU_SIZE]; int in; int out = 0; get_bh(bh); bhs[out++] = bh; for (in = 0; in < BH_LRU_SIZE; in++) { struct buffer_head *bh2 = lru->bhs[in]; if (bh2 == bh) { __brelse(bh2); } else { if (out >= BH_LRU_SIZE) { BUG_ON(evictee != NULL); evictee = bh2; } else { bhs[out++] = bh2; } } } while (out < BH_LRU_SIZE) bhs[out++] = NULL; memcpy(lru->bhs, bhs, sizeof(bhs)); } bh_lru_unlock(); if (evictee) __brelse(evictee); } /* * Look up the bh in this cpu's LRU. If it's there, move it to the head. */ static struct buffer_head * lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *ret = NULL; struct bh_lru *lru; unsigned int i; check_irqs_on(); bh_lru_lock(); lru = &__get_cpu_var(bh_lrus); for (i = 0; i < BH_LRU_SIZE; i++) { struct buffer_head *bh = lru->bhs[i]; if (bh && bh->b_bdev == bdev && bh->b_blocknr == block && bh->b_size == size) { if (i) { while (i) { lru->bhs[i] = lru->bhs[i - 1]; i--; } lru->bhs[0] = bh; } get_bh(bh); ret = bh; break; } } bh_lru_unlock(); return ret; } /* * Perform a pagecache lookup for the matching buffer. If it's there, refresh * it in the LRU and mark it as accessed. If it is not present then return * NULL */ struct buffer_head * __find_get_block(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = lookup_bh_lru(bdev, block, size); if (bh == NULL) { bh = __find_get_block_slow(bdev, block); if (bh) bh_lru_install(bh); } if (bh) touch_buffer(bh); return bh; } EXPORT_SYMBOL(__find_get_block); /* * __getblk will locate (and, if necessary, create) the buffer_head * which corresponds to the passed block_device, block and size. The * returned buffer has its reference count incremented. * * __getblk() cannot fail - it just keeps trying. If you pass it an * illegal block number, __getblk() will happily return a buffer_head * which represents the non-existent block. Very weird. * * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() * attempt is failing. FIXME, perhaps? */ struct buffer_head * __getblk(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __find_get_block(bdev, block, size); might_sleep(); if (bh == NULL) bh = __getblk_slow(bdev, block, size); return bh; } EXPORT_SYMBOL(__getblk); /* * Do async read-ahead on a buffer.. */ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __getblk(bdev, block, size); if (likely(bh)) { ll_rw_block(READA, 1, &bh); brelse(bh); } } EXPORT_SYMBOL(__breadahead); /** * __bread() - reads a specified block and returns the bh * @bdev: the block_device to read from * @block: number of block * @size: size (in bytes) to read * * Reads a specified block, and returns buffer head that contains it. * It returns NULL if the block was unreadable. */ struct buffer_head * __bread(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __getblk(bdev, block, size); if (likely(bh) && !buffer_uptodate(bh)) bh = __bread_slow(bh); return bh; } EXPORT_SYMBOL(__bread); /* * invalidate_bh_lrus() is called rarely - but not only at unmount. * This doesn't race because it runs in each cpu either in irq * or with preempt disabled. */ static void invalidate_bh_lru(void *arg) { struct bh_lru *b = &get_cpu_var(bh_lrus); int i; for (i = 0; i < BH_LRU_SIZE; i++) { brelse(b->bhs[i]); b->bhs[i] = NULL; } put_cpu_var(bh_lrus); } static void invalidate_bh_lrus(void) { on_each_cpu(invalidate_bh_lru, NULL, 1, 1); } void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset) { bh->b_page = page; BUG_ON(offset >= PAGE_SIZE); if (PageHighMem(page)) /* * This catches illegal uses and preserves the offset: */ bh->b_data = (char *)(0 + offset); else bh->b_data = page_address(page) + offset; } EXPORT_SYMBOL(set_bh_page); /* * Called when truncating a buffer on a page completely. */ static void discard_buffer(struct buffer_head * bh) { lock_buffer(bh); clear_buffer_dirty(bh); bh->b_bdev = NULL; clear_buffer_mapped(bh); clear_buffer_req(bh); clear_buffer_new(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); unlock_buffer(bh); } /** * block_invalidatepage - invalidate part of all of a buffer-backed page * * @page: the page which is affected * @offset: the index of the truncation point * * block_invalidatepage() is called when all or part of the page has become * invalidatedby a truncate operation. * * block_invalidatepage() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O * is underway against any of the blocks which are outside the truncation * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. */ void block_invalidatepage(struct page *page, unsigned long offset) { struct buffer_head *head, *bh, *next; unsigned int curr_off = 0; BUG_ON(!PageLocked(page)); if (!page_has_buffers(page)) goto out; head = page_buffers(page); bh = head; do { unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; /* * is this block fully invalidated? */ if (offset <= curr_off) discard_buffer(bh); curr_off = next_off; bh = next; } while (bh != head); /* * We release buffers only if the entire page is being invalidated. * The get_block cached value has been unconditionally invalidated, * so real IO is not possible anymore. */ if (offset == 0) try_to_release_page(page, 0); out: return; } EXPORT_SYMBOL(block_invalidatepage); /* * We attach and possibly dirty the buffers atomically wrt * __set_page_dirty_buffers() via private_lock. try_to_free_buffers * is already excluded via the page lock. */ void create_empty_buffers(struct page *page, unsigned long blocksize, unsigned long b_state) { struct buffer_head *bh, *head, *tail; head = alloc_page_buffers(page, blocksize, 1); bh = head; do { bh->b_state |= b_state; tail = bh; bh = bh->b_this_page; } while (bh); tail->b_this_page = head; spin_lock(&page->mapping->private_lock); if (PageUptodate(page) || PageDirty(page)) { bh = head; do { if (PageDirty(page)) set_buffer_dirty(bh); if (PageUptodate(page)) set_buffer_uptodate(bh); bh = bh->b_this_page; } while (bh != head); } attach_page_buffers(page, head); spin_unlock(&page->mapping->private_lock); } EXPORT_SYMBOL(create_empty_buffers); /* * We are taking a block for data and we don't want any output from any * buffer-cache aliases starting from return from that function and * until the moment when something will explicitly mark the buffer * dirty (hopefully that will not happen until we will free that block ;-) * We don't even need to mark it not-uptodate - nobody can expect * anything from a newly allocated buffer anyway. We used to used * unmap_buffer() for such invalidation, but that was wrong. We definitely * don't want to mark the alias unmapped, for example - it would confuse * anyone who might pick it with bread() afterwards... * * Also.. Note that bforget() doesn't lock the buffer. So there can * be writeout I/O going on against recently-freed buffers. We don't * wait on that I/O in bforget() - it's more efficient to wait on the I/O * only if we really need to. That happens here. */ void unmap_underlying_metadata(struct block_device *bdev, sector_t block) { struct buffer_head *old_bh; might_sleep(); old_bh = __find_get_block_slow(bdev, block); if (old_bh) { clear_buffer_dirty(old_bh); wait_on_buffer(old_bh); clear_buffer_req(old_bh); __brelse(old_bh); } } EXPORT_SYMBOL(unmap_underlying_metadata); /* * NOTE! All mapped/uptodate combinations are valid: * * Mapped Uptodate Meaning * * No No "unknown" - must do get_block() * No Yes "hole" - zero-filled * Yes No "allocated" - allocated on disk, not read in * Yes Yes "valid" - allocated and up-to-date in memory. * * "Dirty" is valid only with the last case (mapped+uptodate). */ /* * While block_write_full_page is writing back the dirty buffers under * the page lock, whoever dirtied the buffers may decide to clean them * again at any time. We handle that by only looking at the buffer * state inside lock_buffer(). * * If block_write_full_page() is called for regular writeback * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a * locked buffer. This only can happen if someone has written the buffer * directly, with submit_bh(). At the address_space level PageWriteback * prevents this contention from occurring. */ static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block, struct writeback_control *wbc) { int err; sector_t block; sector_t last_block; struct buffer_head *bh, *head; const unsigned blocksize = 1 << inode->i_blkbits; int nr_underway = 0; BUG_ON(!PageLocked(page)); last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; if (!page_has_buffers(page)) { create_empty_buffers(page, blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); } /* * Be very careful. We have no exclusion from __set_page_dirty_buffers * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the page stays dirty. * * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; * handle that here by just cleaning them. */ block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); head = page_buffers(page); bh = head; /* * Get all the dirty buffers mapped to disk addresses and * handle any aliases from the underlying blockdev's mapping. */ do { if (block > last_block) { /* * mapped buffers outside i_size will occur, because * this page can be outside i_size when there is a * truncate in progress. */ /* * The buffer was zeroed by block_write_full_page() */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) goto recover; if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); } } bh = bh->b_this_page; block++; } while (bh != head); do { if (!buffer_mapped(bh)) continue; /* * If it's a fully non-blocking write attempt and we cannot * lock the buffer then redirty the page. Note that this can * potentially cause a busy-wait loop from pdflush and kswapd * activity, but those code paths have their own higher-level * throttling. */ if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { lock_buffer(bh); } else if (test_set_buffer_locked(bh)) { redirty_page_for_writepage(wbc, page); continue; } if (test_clear_buffer_dirty(bh)) { mark_buffer_async_write(bh); } else { unlock_buffer(bh); } } while ((bh = bh->b_this_page) != head); /* * The page and its buffers are protected by PageWriteback(), so we can * drop the bh refcounts early. */ BUG_ON(PageWriteback(page)); set_page_writeback(page); do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { submit_bh(WRITE, bh); nr_underway++; } bh = next; } while (bh != head); unlock_page(page); err = 0; done: if (nr_underway == 0) { /* * The page was marked dirty, but the buffers were * clean. Someone wrote them back by hand with * ll_rw_block/submit_bh. A rare case. */ int uptodate = 1; do { if (!buffer_uptodate(bh)) { uptodate = 0; break; } bh = bh->b_this_page; } while (bh != head); if (uptodate) SetPageUptodate(page); end_page_writeback(page); /* * The page and buffer_heads can be released at any time from * here on. */ wbc->pages_skipped++; /* We didn't write this page */ } return err; recover: /* * ENOSPC, or some other error. We may already have added some * blocks to the file, so we need to write these out to avoid * exposing stale data. * The page is currently locked and not marked for writeback */ bh = head; /* Recovery: lock and submit the mapped buffers */ do { if (buffer_mapped(bh) && buffer_dirty(bh)) { lock_buffer(bh); mark_buffer_async_write(bh); } else { /* * The buffer may have been set dirty during * attachment to a dirty page. */ clear_buffer_dirty(bh); } } while ((bh = bh->b_this_page) != head); SetPageError(page); BUG_ON(PageWriteback(page)); set_page_writeback(page); do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); submit_bh(WRITE, bh); nr_underway++; } bh = next; } while (bh != head); unlock_page(page); goto done; } static int __block_prepare_write(struct inode *inode, struct page *page, unsigned from, unsigned to, get_block_t *get_block) { unsigned block_start, block_end; sector_t block; int err = 0; unsigned blocksize, bbits; struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; BUG_ON(!PageLocked(page)); BUG_ON(from > PAGE_CACHE_SIZE); BUG_ON(to > PAGE_CACHE_SIZE); BUG_ON(from > to); blocksize = 1 << inode->i_blkbits; if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); head = page_buffers(page); bbits = inode->i_blkbits; block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); for(bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (PageUptodate(page)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } continue; } if (buffer_new(bh)) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) break; if (buffer_new(bh)) { unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); if (PageUptodate(page)) { set_buffer_uptodate(bh); continue; } if (block_end > to || block_start < from) { void *kaddr; kaddr = kmap_atomic(page, KM_USER0); if (block_end > to) memset(kaddr+to, 0, block_end-to); if (block_start < from) memset(kaddr+block_start, 0, from-block_start); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); } continue; } } if (PageUptodate(page)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { ll_rw_block(READ, 1, &bh); *wait_bh++=bh; } } /* * If we issued read requests - let them complete. */ while(wait_bh > wait) { wait_on_buffer(*--wait_bh); if (!buffer_uptodate(*wait_bh)) err = -EIO; } if (!err) { bh = head; do { if (buffer_new(bh)) clear_buffer_new(bh); } while ((bh = bh->b_this_page) != head); return 0; } /* Error case: */ /* * Zero out any newly allocated blocks to avoid exposing stale * data. If BH_New is set, we know that the block was newly * allocated in the above loop. */ bh = head; block_start = 0; do { block_end = block_start+blocksize; if (block_end <= from) goto next_bh; if (block_start >= to) break; if (buffer_new(bh)) { void *kaddr; clear_buffer_new(bh); kaddr = kmap_atomic(page, KM_USER0); memset(kaddr+block_start, 0, bh->b_size); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); set_buffer_uptodate(bh); mark_buffer_dirty(bh); } next_bh: block_start = block_end; bh = bh->b_this_page; } while (bh != head); return err; } static int __block_commit_write(struct inode *inode, struct page *page, unsigned from, unsigned to) { unsigned block_start, block_end; int partial = 0; unsigned blocksize; struct buffer_head *bh, *head; blocksize = 1 << inode->i_blkbits; for(bh = head = page_buffers(page), block_start = 0; bh != head || !block_start; block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) partial = 1; } else { set_buffer_uptodate(bh); mark_buffer_dirty(bh); } } /* * If this is a partial write which happened to make all buffers * uptodate then we can optimize away a bogus readpage() for * the next read(). Here we 'discover' whether the page went * uptodate as a result of this (potentially partial) write. */ if (!partial) SetPageUptodate(page); return 0; } /* * Generic "read page" function for block devices that have the normal * get_block functionality. This is most of the block device filesystems. * Reads the page asynchronously --- the unlock_buffer() and * set/clear_buffer_uptodate() functions propagate buffer state into the * page struct once IO has completed. */ int block_read_full_page(struct page *page, get_block_t *get_block) { struct inode *inode = page->mapping->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize; int nr, i; int fully_mapped = 1; BUG_ON(!PageLocked(page)); blocksize = 1 << inode->i_blkbits; if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); head = page_buffers(page); iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; bh = head; nr = 0; i = 0; do { if (buffer_uptodate(bh)) continue; if (!buffer_mapped(bh)) { int err = 0; fully_mapped = 0; if (iblock < lblock) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, iblock, bh, 0); if (err) SetPageError(page); } if (!buffer_mapped(bh)) { void *kaddr = kmap_atomic(page, KM_USER0); memset(kaddr + i * blocksize, 0, blocksize); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); if (!err) set_buffer_uptodate(bh); continue; } /* * get_block() might have updated the buffer * synchronously */ if (buffer_uptodate(bh)) continue; } arr[nr++] = bh; } while (i++, iblock++, (bh = bh->b_this_page) != head); if (fully_mapped) SetPageMappedToDisk(page); if (!nr) { /* * All buffers are uptodate - we can set the page uptodate * as well. But not if get_block() returned an error. */ if (!PageError(page)) SetPageUptodate(page); unlock_page(page); return 0; } /* Stage two: lock the buffers */ for (i = 0; i < nr; i++) { bh = arr[i]; lock_buffer(bh); mark_buffer_async_read(bh); } /* * Stage 3: start the IO. Check for uptodateness * inside the buffer lock in case another process reading * the underlying blockdev brought it uptodate (the sct fix). */ for (i = 0; i < nr; i++) { bh = arr[i]; if (buffer_uptodate(bh)) end_buffer_async_read(bh, 1); else submit_bh(READ, bh); } return 0; } /* utility function for filesystems that need to do work on expanding * truncates. Uses prepare/commit_write to allow the filesystem to * deal with the hole. */ static int __generic_cont_expand(struct inode *inode, loff_t size, pgoff_t index, unsigned int offset) { struct address_space *mapping = inode->i_mapping; struct page *page; unsigned long limit; int err; err = -EFBIG; limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && size > (loff_t)limit) { send_sig(SIGXFSZ, current, 0); goto out; } if (size > inode->i_sb->s_maxbytes) goto out; err = -ENOMEM; page = grab_cache_page(mapping, index); if (!page) goto out; err = mapping->a_ops->prepare_write(NULL, page, offset, offset); if (err) { /* * ->prepare_write() may have instantiated a few blocks * outside i_size. Trim these off again. */ unlock_page(page); page_cache_release(page); vmtruncate(inode, inode->i_size); goto out; } err = mapping->a_ops->commit_write(NULL, page, offset, offset); unlock_page(page); page_cache_release(page); if (err > 0) err = 0; out: return err; } int generic_cont_expand(struct inode *inode, loff_t size) { pgoff_t index; unsigned int offset; offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */ /* ugh. in prepare/commit_write, if from==to==start of block, we ** skip the prepare. make sure we never send an offset for the start ** of a block */ if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { /* caller must handle this extra byte. */ offset++; } index = size >> PAGE_CACHE_SHIFT; return __generic_cont_expand(inode, size, index, offset); } int generic_cont_expand_simple(struct inode *inode, loff_t size) { loff_t pos = size - 1; pgoff_t index = pos >> PAGE_CACHE_SHIFT; unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1; /* prepare/commit_write can handle even if from==to==start of block. */ return __generic_cont_expand(inode, size, index, offset); } /* * For moronic filesystems that do not allow holes in file. * We may have to extend the file. */ int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, loff_t *bytes) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; struct page *new_page; pgoff_t pgpos; long status; unsigned zerofrom; unsigned blocksize = 1 << inode->i_blkbits; void *kaddr; while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { status = -ENOMEM; new_page = grab_cache_page(mapping, pgpos);


context:
space:
mode: