path: root/drivers/net/ethernet/sfc/vfdi.h
/****************************************************************************
 * Driver for Solarflare network controllers and boards
 * Copyright 2010-2012 Solarflare Communications Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation, incorporated herein by reference.
 */
#ifndef _VFDI_H
#define _VFDI_H

/**
 * DOC: Virtual Function Driver Interface
 *
 * This file contains software structures used to form a two way
 * communication channel between the VF driver and the PF driver,
 * named Virtual Function Driver Interface (VFDI).
 *
 * For the purposes of VFDI, a page is a memory region with size and
 * alignment of 4K.  All addresses are DMA addresses to be used within
 * the domain of the relevant VF.
 *
 * The only hardware-defined channels for a VF driver to communicate
 * with the PF driver are the event mailboxes (%FR_CZ_USR_EV
 * registers).  Writing to these registers generates an event with
 * EV_CODE = EV_CODE_USR_EV, USER_QID set to the index of the mailbox
 * and USER_EV_REG_VALUE set to the value written.  The PF driver may
 * direct or disable delivery of these events by setting
 * %FR_CZ_USR_EV_CFG.
 *
 * The PF driver can send arbitrary events to arbitrary event queues.
 * However, for consistency, VFDI events from the PF are defined to
 * follow the same form and be sent to the first event queue assigned
 * to the VF while that queue is enabled by the VF driver.
 *
 * The general form of the variable bits of VFDI events is:
 *
 *       0             16                       24   31
 *      | DATA        | TYPE                   | SEQ   |
 *
 * SEQ is a sequence number which should be incremented by 1 (modulo
 * 256) for each event.  The sequence numbers used in each direction
 * are independent.
 *
 * The VF submits requests of type &struct vfdi_req by sending the
 * address of the request (ADDR) in a series of 4 events:
 *
 *       0             16                       24   31
 *      | ADDR[0:15]  | VFDI_EV_TYPE_REQ_WORD0 | SEQ   |
 *      | ADDR[16:31] | VFDI_EV_TYPE_REQ_WORD1 | SEQ+1 |
 *      | ADDR[32:47] | VFDI_EV_TYPE_REQ_WORD2 | SEQ+2 |
 *      | ADDR[48:63] | VFDI_EV_TYPE_REQ_WORD3 | SEQ+3 |
 *
 * The address must be page-aligned.  After receiving such a valid
 * series of events, the PF driver will attempt to read the request
 * and write a response to the same address.  In case of an invalid
 * sequence of events or a DMA error, there will be no response.
 *
 * The VF driver may request that the PF driver write status
 * information into its domain asynchronously.  After writing the
 * status, the PF driver will send an event of the form:
 *
 *       0             16                       24   31
 *      | reserved    | VFDI_EV_TYPE_STATUS    | SEQ   |
 *
 * In case the VF must be reset for any reason, the PF driver will
 * send an event of the form:
 *
 *       0             16                       24   31
 *      | reserved    | VFDI_EV_TYPE_RESET     | SEQ   |
 *
 * It is then the responsibility of the VF driver to request
 * reinitialisation of its queues.
 */
#define VFDI_EV_SEQ_LBN 24
#define VFDI_EV_SEQ_WIDTH 8
#define VFDI_EV_TYPE_LBN 16
#define VFDI_EV_TYPE_WIDTH 8
#define VFDI_EV_TYPE_REQ_WORD0 0
#define VFDI_EV_TYPE_REQ_WORD1 1
#define VFDI_EV_TYPE_REQ_WORD2 2
#define VFDI_EV_TYPE_REQ_WORD3 3
#define VFDI_EV_TYPE_STATUS 4
#define VFDI_EV_TYPE_RESET 5
#define VFDI_EV_DATA_LBN 0
#define VFDI_EV_DATA_WIDTH 16
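
/*
 * Illustrative sketch, not part of the interface definition above: how a
 * VF driver might split the DMA address of a request page into the four
 * REQ_WORD events described in the DOC comment.  The vfdi_post_event()
 * callback is hypothetical; it stands in for whatever helper the driver
 * uses to write a single user event to its FR_CZ_USR_EV mailbox.
 */
static inline void vfdi_send_req_events(u64 req_dma_addr, u8 *seq,
					void (*vfdi_post_event)(u16 data,
								u8 type,
								u8 seq))
{
	unsigned int word;

	/* ADDR[0:15]..ADDR[48:63] are sent as REQ_WORD0..REQ_WORD3, with
	 * the sequence number incremented (modulo 256) for each event.
	 */
	for (word = 0; word < 4; word++)
		vfdi_post_event((u16)(req_dma_addr >> (16 * word)),
				VFDI_EV_TYPE_REQ_WORD0 + word,
				(*seq)++);
}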

struct vfdi_endpoint {
	u8 mac_addr[ETH_ALEN];
	__be16 tci;
};

/**
 * enum vfdi_op - VFDI operation enumeration
 * @VFDI_OP_RESPONSE: Indicates a response to the request.
 * @VFDI_OP_INIT_EVQ: Initialize SRAM entries and initialize an EVQ.
 * @VFDI_OP_INIT_RXQ: Initialize SRAM entries and initialize an RXQ.
 * @VFDI_OP_INIT_TXQ: Initialize SRAM entries and initialize a TXQ.
 * @VFDI_OP_FINI_ALL_QUEUES: Flush all queues, finalize all queues, then
 *	finalize the SRAM entries.
 * @VFDI_OP_INSERT_FILTER: Insert a MAC filter targeting the given RXQ.
 * @VFDI_OP_REMOVE_ALL_FILTERS: Remove all filters.
 * @VFDI_OP_SET_STATUS_PAGE: Set the DMA page(s) used for status updates
 *	from PF and write the initial status.
 * @VFDI_OP_CLEAR_STATUS_PAGE: Clear the DMA page(s) used for status
 *	updates from PF.
 */
enum vfdi_op {
	VFDI_OP_RESPONSE = 0,
	VFDI_OP_INIT_EVQ = 1,
	VFDI_OP_INIT_RXQ = 2,
	VFDI_OP_INIT_TXQ = 3,
	VFDI_OP_FINI_ALL_QUEUES = 4,
	VFDI_OP_INSERT_FILTER = 5,
	VFDI_OP_REMOVE_ALL_FILTERS = 6,
	VFDI_OP_SET_STATUS_PAGE = 7,
	VFDI_OP_CLEAR_STATUS_PAGE = 8,
	VFDI_OP_LIMIT,
};

/* Response codes for VFDI operations. Other values may be used in future. */
#define VFDI_RC_SUCCESS		0
#define VFDI_RC_ENOMEM		(-12)
#define VFDI_RC_EINVAL		(-22)
#define VFDI_RC_EOPNOTSUPP	(-95)
#define VFDI_RC_ETIMEDOUT	(-110)

/**
 * struct vfdi_req - Request from VF driver to PF driver
 * @op: Operation code or response indicator, taken from &enum vfdi_op.
 * @rc: Response code.  Set to 0 on success or a negative error code on failure.
 * @u.init_evq.index: Index of event queue to create.
 * @u.init_evq.buf_count: Number of 4k buffers backing event queue.
 * @u.init_evq.addr: Array of length %u.init_evq.buf_count containing DMA
 *	address of each page backing the event queue.
 * @u.init_rxq.index: Index of receive queue to create.
 * @u.init_rxq.buf_count: Number of 4k buffers backing receive queue.
 * @u.init_rxq.evq: Instance of event queue to target receive events at.
 * @u.init_rxq.label: Label used in receive events.
 * @u.init_rxq.flags: Unused.
 * @u.init_rxq.addr: Array of length %u.init_rxq.buf_count containing DMA
 *	address of each page backing the receive queue.
 * @u.init_txq.index: Index of transmit queue to create.
 * @u.init_txq.buf_count: Number of 4k buffers backing transmit queue.
 * @u.init_txq.evq: Instance of event queue to target transmit completion
 *	events at.
 * @u.init_txq.label: Label used in transmit completion events.
 * @u.init_txq.flags: Checksum offload flags.
 * @u.init_txq.addr: Array of length %u.init_txq.buf_count containing DMA
 *	address of each page backing the transmit queue.
 * @u.mac_filter.rxq: Insert MAC filter at VF local address/VLAN targeting
 *	all traffic at this receive queue.
 * @u.mac_filter.flags: MAC filter flags.
 * @u.set_status_page.dma_addr: Base address for the &struct vfdi_status.
 *	This address must be page-aligned and the PF may write up to a
 *	whole page (allowing for extension of the structure).
 * @u.set_status_page.peer_page_count: Number of additional pages the VF
 *	has provided into which peer addresses may be DMA'd.
 * @u.set_status_page.peer_page_addr: Array of DMA addresses of pages.
 *	If the number of peers exceeds 256, then the VF must provide
 *	additional pages in this array. The PF will then DMA up to
 *	512 vfdi_endpoint structures into each page.  These addresses
 *	must be page-aligned.
 */
struct vfdi_req {
	u32 op;
	u32 reserved1;
	s32 rc;
	u32 reserved2;
	union {
		struct {
			u32 index;
			u32 buf_count;
			u64 addr[];
		} init_evq;
		struct {
			u32 index;
			u32 buf_count;
			u32 evq;
			u32 label;
			u32 flags;
#define VFDI_RXQ_FLAG_SCATTER_EN 1
			u32 reserved;
			u64 addr[];
		} init_rxq;
		struct {
			u32 index;
			u32 buf_count;
			u32 evq;
			u32 label;
			u32 flags;
#define VFDI_TXQ_FLAG_IP_CSUM_DIS 1
#define VFDI_TXQ_FLAG_TCPUDP_CSUM_DIS 2
			u32 reserved;
			u64 addr[];
		} init_txq;
		struct {
			u32 rxq;
			u32 flags;
#define VFDI_MAC_FILTER_FLAG_RSS 1
#define VFDI_MAC_FILTER_FLAG_SCATTER 2
		} mac_filter;
		struct {
			u64 dma_addr;
			u64 peer_page_count;
			u64 peer_page_addr[];
		} set_status_page;
	} u;
};
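
/*
 * Illustrative sketch, assuming the VF driver has already allocated and
 * DMA-mapped a page-aligned request buffer: filling in a VFDI_OP_INIT_EVQ
 * request.  The function name and parameters are hypothetical; only the
 * structure layout and the operation code come from this header.
 */
static inline void vfdi_fill_init_evq(struct vfdi_req *req, u32 evq_index,
				      const u64 *buf_addrs, u32 buf_count)
{
	u32 i;

	memset(req, 0, sizeof(*req));
	req->op = VFDI_OP_INIT_EVQ;
	req->u.init_evq.index = evq_index;
	req->u.init_evq.buf_count = buf_count;
	/* One DMA address per 4K buffer backing the event queue. */
	for (i = 0; i < buf_count; i++)
		req->u.init_evq.addr[i] = buf_addrs[i];

	/* After the four REQ_WORD events are sent (see above), the PF
	 * writes the response back to this same address with @op set to
	 * VFDI_OP_RESPONSE and @rc holding the result.
	 */
}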

/**
 * struct vfdi_status - Status provided by PF driver to VF driver
 * @generation_start: A generation count DMA'd to VF *before* the
 *	rest of the structure.
 * @generation_end: A generation count DMA'd to VF *after* the
 *	rest of the structure.
 * @version: Version of this structure; currently set to 1.  Later
 *	versions must either be layout-compatible or only be sent to VFs
 *	that specifically request them.
 * @length: Total length of this structure including embedded tables.
 * @vi_scale: log2 of the number of VIs available on this VF.  This quantity
 *	is used by the hardware for register decoding.
 * @max_tx_channels: The maximum number of transmit queues the VF can use.
 * @rss_rxq_count: The number of receive queues present in the shared RSS
 *	indirection table.
 * @peer_count: Total number of peers in the complete peer list.  If larger
 *	than ARRAY_SIZE(%peers), then the VF must provide sufficient
 *	additional pages, each of which is filled with vfdi_endpoint structures.
 * @local: The MAC address and outer VLAN tag of *this* VF.
 * @peers: Table of peer addresses.  The @tci fields in these structures
 *	are currently unused and must be ignored.  Additional peers are
 *	written into any additional pages provided by the VF.
 * @timer_quantum_ns: Timer quantum (nominal period between timer ticks)
 *	for interrupt moderation timers, in nanoseconds. This member is only
 *	present if @length is sufficiently large.
 */
struct vfdi_status {
	u32 generation_start;
	u32 generation_end;
	u32 version;
	u32 length;
	u8 vi_scale;
	u8 max_tx_channels;
	u8 rss_rxq_count;
	u8 reserved1;
	u16 peer_count;
	u16 reserved2;
	struct vfdi_endpoint local;
	struct vfdi_endpoint peers[256];

	/* Members below here extend version 1 of this structure */
	u32 timer_quantum_ns;
};
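
/*
 * Illustrative sketch of how a VF driver could take a consistent snapshot
 * of the status page using the generation counts: @generation_start is
 * DMA'd before the body and @generation_end after it, so the copy is only
 * valid when the two match.  The function is hypothetical; rmb() is the
 * kernel's read memory barrier.
 */
static inline void vfdi_copy_status(const struct vfdi_status *dma_status,
				    struct vfdi_status *snapshot)
{
	u32 start, end;

	do {
		start = dma_status->generation_start;
		rmb();		/* read the body only after generation_start */
		memcpy(snapshot, dma_status, sizeof(*snapshot));
		rmb();		/* read generation_end only after the body */
		end = dma_status->generation_end;
	} while (start != end);	/* PF rewrote the page mid-copy; retry */
}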

#endif
ss="hl kwd">wake_up(&front->waitq); } } } /* * free a writeback record */ static void afs_free_writeback(struct afs_writeback *wb) { _enter(""); key_put(wb->key); kfree(wb); } /* * dispose of a reference to a writeback record */ void afs_put_writeback(struct afs_writeback *wb) { struct afs_vnode *vnode = wb->vnode; _enter("{%d}", wb->usage); spin_lock(&vnode->writeback_lock); if (--wb->usage == 0) afs_unlink_writeback(wb); else wb = NULL; spin_unlock(&vnode->writeback_lock); if (wb) afs_free_writeback(wb); } /* * partly or wholly fill a page that's under preparation for writing */ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, unsigned start, unsigned len, struct page *page) { int ret; _enter(",,%u,%u", start, len); ASSERTCMP(start + len, <=, PAGE_SIZE); ret = afs_vnode_fetch_data(vnode, key, start, len, page); if (ret < 0) { if (ret == -ENOENT) { _debug("got NOENT from server" " - marking file deleted and stale"); set_bit(AFS_VNODE_DELETED, &vnode->flags); ret = -ESTALE; } } _leave(" = %d", ret); return ret; } /* * prepare a page for being written to */ static int afs_prepare_page(struct afs_vnode *vnode, struct page *page, struct key *key, unsigned offset, unsigned to) { unsigned eof, tail, start, stop, len; loff_t i_size, pos; void *p; int ret; _enter(""); if (offset == 0 && to == PAGE_SIZE) return 0; p = kmap_atomic(page, KM_USER0); i_size = i_size_read(&vnode->vfs_inode); pos = (loff_t) page->index << PAGE_SHIFT; if (pos >= i_size) { /* partial write, page beyond EOF */ _debug("beyond"); if (offset > 0) memset(p, 0, offset); if (to < PAGE_SIZE) memset(p + to, 0, PAGE_SIZE - to); kunmap_atomic(p, KM_USER0); return 0; } if (i_size - pos >= PAGE_SIZE) { /* partial write, page entirely before EOF */ _debug("before"); tail = eof = PAGE_SIZE; } else { /* partial write, page overlaps EOF */ eof = i_size - pos; _debug("overlap %u", eof); tail = max(eof, to); if (tail < PAGE_SIZE) memset(p + tail, 0, PAGE_SIZE - tail); if (offset > eof) memset(p + eof, 0, PAGE_SIZE - eof); } kunmap_atomic(p, KM_USER0); ret = 0; if (offset > 0 || eof > to) { /* need to fill one or two bits that aren't going to be written * (cover both fillers in one read if there are two) */ start = (offset > 0) ? 0 : to; stop = (eof > to) ? 
eof : offset; len = stop - start; _debug("wr=%u-%u av=0-%u rd=%u@%u", offset, to, eof, start, len); ret = afs_fill_page(vnode, key, start, len, page); } _leave(" = %d", ret); return ret; } /* * prepare to perform part of a write to a page * - the caller holds the page locked, preventing it from being written out or * modified by anyone else */ int afs_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) { struct afs_writeback *candidate, *wb; struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); struct key *key = file->private_data; pgoff_t index; int ret; _enter("{%x:%u},{%lx},%u,%u", vnode->fid.vid, vnode->fid.vnode, page->index, offset, to); candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); if (!candidate) return -ENOMEM; candidate->vnode = vnode; candidate->first = candidate->last = page->index; candidate->offset_first = offset; candidate->to_last = to; candidate->usage = 1; candidate->state = AFS_WBACK_PENDING; init_waitqueue_head(&candidate->waitq); if (!PageUptodate(page)) { _debug("not up to date"); ret = afs_prepare_page(vnode, page, key, offset, to); if (ret < 0) { kfree(candidate); _leave(" = %d [prep]", ret); return ret; } } try_again: index = page->index; spin_lock(&vnode->writeback_lock); /* see if this page is already pending a writeback under a suitable key * - if so we can just join onto that one */ wb = (struct afs_writeback *) page_private(page); if (wb) { if (wb->key == key && wb->state == AFS_WBACK_PENDING) goto subsume_in_current_wb; goto flush_conflicting_wb; } if (index > 0) { /* see if we can find an already pending writeback that we can * append this page to */ list_for_each_entry(wb, &vnode->writebacks, link) { if (wb->last == index - 1 && wb->key == key && wb->state == AFS_WBACK_PENDING) goto append_to_previous_wb; } } list_add_tail(&candidate->link, &vnode->writebacks); candidate->key = key_get(key); spin_unlock(&vnode->writeback_lock); SetPagePrivate(page); set_page_private(page, (unsigned long) candidate); _leave(" = 0 [new]"); return 0; subsume_in_current_wb: _debug("subsume"); ASSERTRANGE(wb->first, <=, index, <=, wb->last); if (index == wb->first && offset < wb->offset_first) wb->offset_first = offset; if (index == wb->last && to > wb->to_last) wb->to_last = to; spin_unlock(&vnode->writeback_lock); kfree(candidate); _leave(" = 0 [sub]"); return 0; append_to_previous_wb: _debug("append into %lx-%lx", wb->first, wb->last); wb->usage++; wb->last++; wb->to_last = to; spin_unlock(&vnode->writeback_lock); SetPagePrivate(page); set_page_private(page, (unsigned long) wb); kfree(candidate); _leave(" = 0 [app]"); return 0; /* the page is currently bound to another context, so if it's dirty we * need to flush it before we can use the new context */ flush_conflicting_wb: _debug("flush conflict"); if (wb->state == AFS_WBACK_PENDING) wb->state = AFS_WBACK_CONFLICTING; spin_unlock(&vnode->writeback_lock); if (PageDirty(page)) { ret = afs_write_back_from_locked_page(wb, page); if (ret < 0) { afs_put_writeback(candidate); _leave(" = %d", ret); return ret; } } /* the page holds a ref on the writeback record */ afs_put_writeback(wb); set_page_private(page, 0); ClearPagePrivate(page); goto try_again; } /* * finalise part of a write to a page */ int afs_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) { struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); loff_t i_size, maybe_i_size; _enter("{%x:%u},{%lx},%u,%u", vnode->fid.vid, vnode->fid.vnode, page->index, offset, to); maybe_i_size = (loff_t) 
page->index << PAGE_SHIFT; maybe_i_size += to; i_size = i_size_read(&vnode->vfs_inode); if (maybe_i_size > i_size) { spin_lock(&vnode->writeback_lock); i_size = i_size_read(&vnode->vfs_inode); if (maybe_i_size > i_size) i_size_write(&vnode->vfs_inode, maybe_i_size); spin_unlock(&vnode->writeback_lock); } SetPageUptodate(page); set_page_dirty(page); if (PageDirty(page)) _debug("dirtied"); return 0; } /* * kill all the pages in the given range */ static void afs_kill_pages(struct afs_vnode *vnode, bool error, pgoff_t first, pgoff_t last) { struct pagevec pv; unsigned count, loop; _enter("{%x:%u},%lx-%lx", vnode->fid.vid, vnode->fid.vnode, first, last); pagevec_init(&pv, 0); do { _debug("kill %lx-%lx", first, last); count = last - first + 1; if (count > PAGEVEC_SIZE) count = PAGEVEC_SIZE; pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, first, count, pv.pages); ASSERTCMP(pv.nr, ==, count); for (loop = 0; loop < count; loop++) { ClearPageUptodate(pv.pages[loop]); if (error) SetPageError(pv.pages[loop]); end_page_writeback(pv.pages[loop]); } __pagevec_release(&pv); } while (first < last); _leave(""); } /* * synchronously write back the locked page and any subsequent non-locked dirty * pages also covered by the same writeback record */ static int afs_write_back_from_locked_page(struct afs_writeback *wb, struct page *primary_page) { struct page *pages[8], *page; unsigned long count; unsigned n, offset, to; pgoff_t start, first, last; int loop, ret; _enter(",%lx", primary_page->index); count = 1; if (!clear_page_dirty_for_io(primary_page)) BUG(); if (test_set_page_writeback(primary_page)) BUG(); /* find all consecutive lockable dirty pages, stopping when we find a * page that is not immediately lockable, is not dirty or is missing, * or we reach the end of the range */ start = primary_page->index; if (start >= wb->last) goto no_more; start++; do { _debug("more %lx [%lx]", start, count); n = wb->last - start + 1; if (n > ARRAY_SIZE(pages)) n = ARRAY_SIZE(pages); n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping, start, n, pages); _debug("fgpc %u", n); if (n == 0) goto no_more; if (pages[0]->index != start) { do { put_page(pages[--n]); } while (n > 0); goto no_more; } for (loop = 0; loop < n; loop++) { page = pages[loop]; if (page->index > wb->last) break; if (TestSetPageLocked(page)) break; if (!PageDirty(page) || page_private(page) != (unsigned long) wb) { unlock_page(page); break; } if (!clear_page_dirty_for_io(page)) BUG(); if (test_set_page_writeback(page)) BUG(); unlock_page(page); put_page(page); } count += loop; if (loop < n) { for (; loop < n; loop++) put_page(pages[loop]); goto no_more; } start += loop; } while (start <= wb->last && count < 65536); no_more: /* we now have a contiguous set of dirty pages, each with writeback set * and the dirty mark cleared; the first page is locked and must remain * so, all the rest are unlocked */ first = primary_page->index; last = first + count - 1; offset = (first == wb->first) ? wb->offset_first : 0; to = (last == wb->last) ? wb->to_last : PAGE_SIZE; _debug("write back %lx[%u..] 
to %lx[..%u]", first, offset, last, to); ret = afs_vnode_store_data(wb, first, last, offset, to); if (ret < 0) { switch (ret) { case -EDQUOT: case -ENOSPC: set_bit(AS_ENOSPC, &wb->vnode->vfs_inode.i_mapping->flags); break; case -EROFS: case -EIO: case -EREMOTEIO: case -EFBIG: case -ENOENT: case -ENOMEDIUM: case -ENXIO: afs_kill_pages(wb->vnode, true, first, last); set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags); break; case -EACCES: case -EPERM: case -ENOKEY: case -EKEYEXPIRED: case -EKEYREJECTED: case -EKEYREVOKED: afs_kill_pages(wb->vnode, false, first, last); break; default: break; } } else { ret = count; } _leave(" = %d", ret); return ret; } /* * write a page back to the server * - the caller locked the page for us */ int afs_writepage(struct page *page, struct writeback_control *wbc) { struct backing_dev_info *bdi = page->mapping->backing_dev_info; struct afs_writeback *wb; int ret; _enter("{%lx},", page->index); wb = (struct afs_writeback *) page_private(page); ASSERT(wb != NULL); ret = afs_write_back_from_locked_page(wb, page); unlock_page(page); if (ret < 0) { _leave(" = %d", ret); return 0; } wbc->nr_to_write -= ret; if (wbc->nonblocking && bdi_write_congested(bdi)) wbc->encountered_congestion = 1; _leave(" = 0"); return 0; } /* * write a region of pages back to the server */ int afs_writepages_region(struct address_space *mapping, struct writeback_control *wbc, pgoff_t index, pgoff_t end, pgoff_t *_next) { struct backing_dev_info *bdi = mapping->backing_dev_info; struct afs_writeback *wb; struct page *page; int ret, n; _enter(",,%lx,%lx,", index, end); do { n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, 1, &page); if (!n) break; _debug("wback %lx", page->index); if (page->index > end) { *_next = index; page_cache_release(page); _leave(" = 0 [%lx]", *_next); return 0; } /* at this point we hold neither mapping->tree_lock nor lock on * the page itself: the page may be truncated or invalidated * (changing page->mapping to NULL), or even swizzled back from * swapper_space to tmpfs file mapping */ lock_page(page); if (page->mapping != mapping) { unlock_page(page); page_cache_release(page); continue; } if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page); if (PageWriteback(page) || !PageDirty(page)) { unlock_page(page); continue; } wb = (struct afs_writeback *) page_private(page); ASSERT(wb != NULL); spin_lock(&wb->vnode->writeback_lock); wb->state = AFS_WBACK_WRITING; spin_unlock(&wb->vnode->writeback_lock); ret = afs_write_back_from_locked_page(wb, page); unlock_page(page); page_cache_release(page); if (ret < 0) { _leave(" = %d", ret); return ret; } wbc->nr_to_write -= ret; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; break; } cond_resched(); } while (index < end && wbc->nr_to_write > 0); *_next = index; _leave(" = 0 [%lx]", *_next); return 0; } /* * write some of the pending data back to the server */ int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct backing_dev_info *bdi = mapping->backing_dev_info; pgoff_t start, end, next; int ret; _enter(""); if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; _leave(" = 0 [congest]"); return 0; } if (wbc->range_cyclic) { start = mapping->writeback_index; end = -1; ret = afs_writepages_region(mapping, wbc, start, end, &next); if (start > 0 && wbc->nr_to_write > 0 && ret == 0 && !(wbc->nonblocking && wbc->encountered_congestion)) ret = afs_writepages_region(mapping, wbc, 0, start, &next); 
mapping->writeback_index = next; } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT); ret = afs_writepages_region(mapping, wbc, 0, end, &next); if (wbc->nr_to_write > 0) mapping->writeback_index = next; } else { start = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; ret = afs_writepages_region(mapping, wbc, start, end, &next); } _leave(" = %d", ret); return ret; } /* * write an inode back */ int afs_write_inode(struct inode *inode, int sync) { struct afs_vnode *vnode = AFS_FS_I(inode); int ret; _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); ret = 0; if (sync) { ret = filemap_fdatawait(inode->i_mapping); if (ret < 0) __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } _leave(" = %d", ret); return ret; } /* * completion of write to server */ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) { struct afs_writeback *wb = call->wb; struct pagevec pv; unsigned count, loop; pgoff_t first = call->first, last = call->last; bool free_wb; _enter("{%x:%u},{%lx-%lx}", vnode->fid.vid, vnode->fid.vnode, first, last); ASSERT(wb != NULL); pagevec_init(&pv, 0); do { _debug("done %lx-%lx", first, last); count = last - first + 1; if (count > PAGEVEC_SIZE) count = PAGEVEC_SIZE; pv.nr = find_get_pages_contig(call->mapping, first, count, pv.pages); ASSERTCMP(pv.nr, ==, count); spin_lock(&vnode->writeback_lock); for (loop = 0; loop < count; loop++) { struct page *page = pv.pages[loop]; end_page_writeback(page); if (page_private(page) == (unsigned long) wb) { set_page_private(page, 0); ClearPagePrivate(page); wb->usage--; } } free_wb = false; if (wb->usage == 0) { afs_unlink_writeback(wb); free_wb = true; } spin_unlock(&vnode->writeback_lock); first += count; if (free_wb) { afs_free_writeback(wb); wb = NULL; } __pagevec_release(&pv); } while (first <= last); _leave(""); } /* * write to an AFS file */ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct dentry *dentry = iocb->ki_filp->f_path.dentry; struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); ssize_t result; size_t count = iov_length(iov, nr_segs); int ret; _enter("{%x.%u},{%zu},%lu,", vnode->fid.vid, vnode->fid.vnode, count, nr_segs); if (IS_SWAPFILE(&vnode->vfs_inode)) { printk(KERN_INFO "AFS: Attempt to write to active swap file!\n"); return -EBUSY; } if (!count) return 0; result = generic_file_aio_write(iocb, iov, nr_segs, pos); if (IS_ERR_VALUE(result)) { _leave(" = %zd", result); return result; } /* return error values for O_SYNC and IS_SYNC() */ if (IS_SYNC(&vnode->vfs_inode) || iocb->ki_filp->f_flags & O_SYNC) { ret = afs_fsync(iocb->ki_filp, dentry, 1); if (ret < 0) result = ret; } _leave(" = %zd", result); return result; } /* * flush the vnode to the fileserver */ int afs_writeback_all(struct afs_vnode *vnode) { struct address_space *mapping = vnode->vfs_inode.i_mapping; struct writeback_control wbc = { .bdi = mapping->backing_dev_info, .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .for_writepages = 1, .range_cyclic = 1, }; int ret; _enter(""); ret = mapping->a_ops->writepages(mapping, &wbc); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); _leave(" = %d", ret); return ret; } /* * flush any dirty pages for this process, and check for write errors. * - the return status from this call provides a reliable indication of * whether any write errors occurred for this process. 
*/ int afs_fsync(struct file *file, struct dentry *dentry, int datasync) { struct afs_writeback *wb, *xwb; struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); int ret; _enter("{%x:%u},{n=%s},%d", vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, datasync); /* use a writeback record as a marker in the queue - when this reaches * the front of the queue, all the outstanding writes are either * completed or rejected */ wb = kzalloc(sizeof(*wb), GFP_KERNEL); if (!wb) return -ENOMEM; wb->vnode = vnode; wb->first = 0; wb->last = -1; wb->offset_first = 0; wb->to_last = PAGE_SIZE; wb->usage = 1; wb->state = AFS_WBACK_SYNCING; init_waitqueue_head(&wb->waitq); spin_lock(&vnode->writeback_lock); list_for_each_entry(xwb, &vnode->writebacks, link) { if (xwb->state == AFS_WBACK_PENDING) xwb->state = AFS_WBACK_CONFLICTING; } list_add_tail(&wb->link, &vnode->writebacks); spin_unlock(&vnode->writeback_lock); /* push all the outstanding writebacks to the server */ ret = afs_writeback_all(vnode); if (ret < 0) { afs_put_writeback(wb); _leave(" = %d [wb]", ret); return ret; } /* wait for the preceding writes to actually complete */ ret = wait_event_interruptible(wb->waitq, wb->state == AFS_WBACK_COMPLETE || vnode->writebacks.next == &wb->link); afs_put_writeback(wb); _leave(" = %d", ret); return ret; }