From e96018280cb36210f4c69663561825114a57e7e1 Mon Sep 17 00:00:00 2001
From: "Talpey, Thomas"
Date: Mon, 10 Sep 2007 13:50:42 -0400
Subject: RPCRDMA: rpc rdma protocol implementation

This implements the marshaling and unmarshaling of the rpcrdma
transport headers.  Connection management is also addressed.

Signed-off-by: Tom Talpey
Signed-off-by: Trond Myklebust
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 867 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 863 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index b0587f3a5d77..12db63580427 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1,9 +1,868 @@
 /*
- * Placeholders for subsequent patches
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * rpc_rdma.c
+ *
+ * This file contains the guts of the RPC RDMA protocol, and
+ * does marshaling/unmarshaling, etc. It is also where interfacing
+ * to the Linux RPC framework lives.
  */
 
 #include "xprt_rdma.h"
 
-void rpcrdma_conn_func(struct rpcrdma_ep *a) { }
-void rpcrdma_reply_handler(struct rpcrdma_rep *a) { }
-int rpcrdma_marshal_req(struct rpc_rqst *a) { return EINVAL; }
+#include <linux/highmem.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+enum rpcrdma_chunktype {
+	rpcrdma_noch = 0,
+	rpcrdma_readch,
+	rpcrdma_areadch,
+	rpcrdma_writech,
+	rpcrdma_replych
+};
+
+#ifdef RPC_DEBUG
+static const char transfertypes[][12] = {
+	"pure inline",	/* no chunks */
+	" read chunk",	/* some argument via rdma read */
+	"*read chunk",	/* entire request via rdma read */
+	"write chunk",	/* some result via rdma write */
+	"reply chunk"	/* entire reply via rdma write */
+};
+#endif
+
+/*
+ * Chunk assembly from upper layer xdr_buf.
+ *
+ * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
+ * elements. Segments are then coalesced when registered, if possible
+ * within the selected memreg mode.
+ *
+ * Note, this routine is never called if the connection's memory
+ * registration strategy is 0 (bounce buffers).
+ */
+
+static int
+rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, int pos,
+	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
+{
+	int len, n = 0, p;
+
+	if (pos == 0 && xdrbuf->head[0].iov_len) {
+		seg[n].mr_page = NULL;
+		seg[n].mr_offset = xdrbuf->head[0].iov_base;
+		seg[n].mr_len = xdrbuf->head[0].iov_len;
+		pos += xdrbuf->head[0].iov_len;
+		++n;
+	}
+
+	if (xdrbuf->page_len && (xdrbuf->pages[0] != NULL)) {
+		if (n == nsegs)
+			return 0;
+		seg[n].mr_page = xdrbuf->pages[0];
+		seg[n].mr_offset = (void *)(unsigned long) xdrbuf->page_base;
+		seg[n].mr_len = min_t(u32,
+			PAGE_SIZE - xdrbuf->page_base, xdrbuf->page_len);
+		len = xdrbuf->page_len - seg[n].mr_len;
+		pos += len;
+		++n;
+		p = 1;
+		while (len > 0) {
+			if (n == nsegs)
+				return 0;
+			seg[n].mr_page = xdrbuf->pages[p];
+			seg[n].mr_offset = NULL;
+			seg[n].mr_len = min_t(u32, PAGE_SIZE, len);
+			len -= seg[n].mr_len;
+			++n;
+			++p;
+		}
+	}
+
+	if (pos < xdrbuf->len && xdrbuf->tail[0].iov_len) {
+		if (n == nsegs)
+			return 0;
+		seg[n].mr_page = NULL;
+		seg[n].mr_offset = xdrbuf->tail[0].iov_base;
+		seg[n].mr_len = xdrbuf->tail[0].iov_len;
+		pos += xdrbuf->tail[0].iov_len;
+		++n;
+	}
+
+	if (pos < xdrbuf->len)
+		dprintk("RPC:       %s: marshaled only %d of %d\n",
+				__func__, pos, xdrbuf->len);
+
+	return n;
+}
+
+/*
+ * Create read/write chunk lists, and reply chunks, for RDMA
+ *
+ * Assume check against THRESHOLD has been done, and chunks are required.
+ * Assume only encoding one list entry for read|write chunks. The NFSv3
+ * protocol is simple enough to allow this as it only has a single "bulk
+ * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
+ * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
+ *
+ * When used for a single reply chunk (which is a special write
+ * chunk used for the entire reply, rather than just the data), it
+ * is used primarily for READDIR and READLINK which would otherwise
+ * be severely size-limited by a small rdma inline read max. The server
+ * response will come back as an RDMA Write, followed by a message
+ * of type RDMA_NOMSG carrying the xid and length. As a result, reply
+ * chunks do not provide data alignment, however they do not require
+ * "fixup" (moving the response to the upper layer buffer) either.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
+ *  Read chunklist (a linked list):
+ *   N elements, position P (same P for all chunks of same arg!):
+ *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
+ *
+ *  Write chunklist (a list of (one) counted array):
+ *   N elements:
+ *    1 - N - HLOO - HLOO - ... - HLOO - 0
+ *
+ *  Reply chunk (a counted array):
+ *   N elements:
+ *    1 - N - HLOO - HLOO - ... - HLOO
+ */
+
+static unsigned int
+rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+		struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+{
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_task->tk_xprt);
+	int nsegs, nchunks = 0;
+	int pos;
+	struct rpcrdma_mr_seg *seg = req->rl_segments;
+	struct rpcrdma_read_chunk *cur_rchunk = NULL;
+	struct rpcrdma_write_array *warray = NULL;
+	struct rpcrdma_write_chunk *cur_wchunk = NULL;
+	u32 *iptr = headerp->rm_body.rm_chunks;
+
+	if (type == rpcrdma_readch || type == rpcrdma_areadch) {
+		/* a read chunk - server will RDMA Read our memory */
+		cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
+	} else {
+		/* a write or reply chunk - server will RDMA Write our memory */
+		*iptr++ = xdr_zero;	/* encode a NULL read chunk list */
+		if (type == rpcrdma_replych)
+			*iptr++ = xdr_zero;	/* a NULL write chunk list */
+		warray = (struct rpcrdma_write_array *) iptr;
+		cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
+	}
+
+	if (type == rpcrdma_replych || type == rpcrdma_areadch)
+		pos = 0;
+	else
+		pos = target->head[0].iov_len;
+
+	nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+	if (nsegs == 0)
+		return 0;
+
+	do {
+		/* bind/register the memory, then build chunk from result. */
+		int n = rpcrdma_register_external(seg, nsegs,
+						cur_wchunk != NULL, r_xprt);
+		if (n <= 0)
+			goto out;
+		if (cur_rchunk) {	/* read */
+			cur_rchunk->rc_discrim = xdr_one;
+			/* all read chunks have the same "position" */
+			cur_rchunk->rc_position = htonl(pos);
+			cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey);
+			cur_rchunk->rc_target.rs_length = htonl(seg->mr_len);
+			xdr_encode_hyper(
+					(u32 *)&cur_rchunk->rc_target.rs_offset,
+					seg->mr_base);
+			dprintk("RPC:       %s: read chunk "
+				"elem %d@0x%llx:0x%x pos %d (%s)\n", __func__,
+				seg->mr_len, seg->mr_base, seg->mr_rkey, pos,
+				n < nsegs ? "more" : "last");
+			cur_rchunk++;
+			r_xprt->rx_stats.read_chunk_count++;
+		} else {		/* write/reply */
+			cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey);
+			cur_wchunk->wc_target.rs_length = htonl(seg->mr_len);
+			xdr_encode_hyper(
+					(u32 *)&cur_wchunk->wc_target.rs_offset,
+					seg->mr_base);
+			dprintk("RPC:       %s: %s chunk "
+				"elem %d@0x%llx:0x%x (%s)\n", __func__,
+				(type == rpcrdma_replych) ? "reply" : "write",
+				seg->mr_len, seg->mr_base, seg->mr_rkey,
+				n < nsegs ? "more" : "last");
+			cur_wchunk++;
+			if (type == rpcrdma_replych)
+				r_xprt->rx_stats.reply_chunk_count++;
+			else
+				r_xprt->rx_stats.write_chunk_count++;
+			r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+		}
+		nchunks++;
+		seg += n;
+		nsegs -= n;
+	} while (nsegs);
+
+	/* success. all failures return above */
+	req->rl_nchunks = nchunks;
+
+	BUG_ON(nchunks == 0);
+
+	/*
+	 * finish off header. If write, marshal discrim and nchunks.
+	 */
+	if (cur_rchunk) {
+		iptr = (u32 *) cur_rchunk;
+		*iptr++ = xdr_zero;	/* finish the read chunk list */
+		*iptr++ = xdr_zero;	/* encode a NULL write chunk list */
+		*iptr++ = xdr_zero;	/* encode a NULL reply chunk */
+	} else {
+		warray->wc_discrim = xdr_one;
+		warray->wc_nchunks = htonl(nchunks);
+		iptr = (u32 *) cur_wchunk;
+		if (type == rpcrdma_writech) {
+			*iptr++ = xdr_zero; /* finish the write chunk list */
+			*iptr++ = xdr_zero; /* encode a NULL reply chunk */
+		}
+	}
+
+	/*
+	 * Return header size.
+	 */
+	return (unsigned char *)iptr - (unsigned char *)headerp;
+
+out:
+	for (pos = 0; nchunks--;)
+		pos += rpcrdma_deregister_external(
+				&req->rl_segments[pos], r_xprt, NULL);
+	return 0;
+}
+
+/*
+ * Copy write data inline.
+ * This function is used for "small" requests. Data which is passed
+ * to RPC via iovecs (or page list) is copied directly into the
+ * pre-registered memory buffer for this request. For small amounts
+ * of data, this is efficient. The cutoff value is tunable.
+ */
+static int
+rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+{
+	int i, npages, curlen;
+	int copy_len;
+	unsigned char *srcp, *destp;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+
+	destp = rqst->rq_svec[0].iov_base;
+	curlen = rqst->rq_svec[0].iov_len;
+	destp += curlen;
+	/*
+	 * Do optional padding where it makes sense. Alignment of write
+	 * payload can help the server, if our setting is accurate.
+	 */
+	pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
+	if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
+		pad = 0;	/* don't pad this request */
+
+	dprintk("RPC:       %s: pad %d destp 0x%p len %d hdrlen %d\n",
+		__func__, pad, destp, rqst->rq_slen, curlen);
+
+	copy_len = rqst->rq_snd_buf.page_len;
+	r_xprt->rx_stats.pullup_copy_count += copy_len;
+	npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT;
+	for (i = 0; copy_len && i < npages; i++) {
+		if (i == 0)
+			curlen = PAGE_SIZE - rqst->rq_snd_buf.page_base;
+		else
+			curlen = PAGE_SIZE;
+		if (curlen > copy_len)
+			curlen = copy_len;
+		dprintk("RPC:       %s: page %d destp 0x%p len %d curlen %d\n",
+			__func__, i, destp, copy_len, curlen);
+		srcp = kmap_atomic(rqst->rq_snd_buf.pages[i],
+					KM_SKB_SUNRPC_DATA);
+		if (i == 0)
+			memcpy(destp, srcp+rqst->rq_snd_buf.page_base, curlen);
+		else
+			memcpy(destp, srcp, curlen);
+		kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA);
+		rqst->rq_svec[0].iov_len += curlen;
+		destp += curlen;
+		copy_len -= curlen;
+	}
+	if (rqst->rq_snd_buf.tail[0].iov_len) {
+		curlen = rqst->rq_snd_buf.tail[0].iov_len;
+		if (destp != rqst->rq_snd_buf.tail[0].iov_base) {
+			memcpy(destp,
+				rqst->rq_snd_buf.tail[0].iov_base, curlen);
+			r_xprt->rx_stats.pullup_copy_count += curlen;
+		}
+		dprintk("RPC:       %s: tail destp 0x%p len %d curlen %d\n",
+			__func__, destp, copy_len, curlen);
+		rqst->rq_svec[0].iov_len += curlen;
+	}
+	/* header now contains entire send message */
+	return pad;
+}
+
+/*
+ * Marshal a request: the primary job of this routine is to choose
+ * the transfer modes. See comments below.
+ *
+ * Uses multiple RDMA IOVs for a request:
+ *  [0] -- RPC RDMA header, which uses memory from the *start* of the
+ *         preregistered buffer that already holds the RPC data in
+ *         its middle.
+ *  [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
+ *  [2] -- optional padding.
+ *  [3] -- if padded, header only in [1] and data here.
+ */
+
+int
+rpcrdma_marshal_req(struct rpc_rqst *rqst)
+{
+	struct rpc_xprt *xprt = rqst->rq_task->tk_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	char *base;
+	size_t hdrlen, rpclen, padlen;
+	enum rpcrdma_chunktype rtype, wtype;
+	struct rpcrdma_msg *headerp;
+
+	/*
+	 * rpclen gets amount of data in first buffer, which is the
+	 * pre-registered buffer.
+	 */
+	base = rqst->rq_svec[0].iov_base;
+	rpclen = rqst->rq_svec[0].iov_len;
+
+	/* build RDMA header in private area at front */
+	headerp = (struct rpcrdma_msg *) req->rl_base;
+	/* don't htonl XID, it's already done in request */
+	headerp->rm_xid = rqst->rq_xid;
+	headerp->rm_vers = xdr_one;
+	headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests);
+	headerp->rm_type = __constant_htonl(RDMA_MSG);
+
+	/*
+	 * Chunks needed for results?
+	 *
+	 * o If the expected result is under the inline threshold, all ops
+	 *   return as inline (but see later).
+	 * o Large non-read ops return as a single reply chunk.
+	 * o Large read ops return data as write chunk(s), header as inline.
+	 *
+	 * Note: the NFS code sending down multiple result segments implies
+	 * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
+	 */
+
+	/*
+	 * This code can handle read chunks, write chunks OR reply
+	 * chunks -- only one type. If the request is too big to fit
+	 * inline, then we will choose read chunks. If the request is
+	 * a READ, then use write chunks to separate the file data
+	 * into pages; otherwise use reply chunks.
+	 */
+	if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
+		wtype = rpcrdma_noch;
+	else if (rqst->rq_rcv_buf.page_len == 0)
+		wtype = rpcrdma_replych;
+	else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+		wtype = rpcrdma_writech;
+	else
+		wtype = rpcrdma_replych;
+
+	/*
+	 * Chunks needed for arguments?
+	 *
+	 * o If the total request is under the inline threshold, all ops
+	 *   are sent as inline.
+	 * o Large non-write ops are sent with the entire message as a
+	 *   single read chunk (protocol 0-position special case).
+	 * o Large write ops transmit data as read chunk(s), header as
+	 *   inline.
+	 *
+	 * Note: the NFS code sending down multiple argument segments
+	 * implies the op is a write.
+	 * TBD check NFSv4 setacl
+	 */
+	if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+		rtype = rpcrdma_noch;
+	else if (rqst->rq_snd_buf.page_len == 0)
+		rtype = rpcrdma_areadch;
+	else
+		rtype = rpcrdma_readch;
+
+	/* The following simplification is not true forever */
+	if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
+		wtype = rpcrdma_noch;
+	BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
+
+	if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS &&
+	    (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) {
+		/* forced to "pure inline"? */
+		dprintk("RPC:       %s: too much data (%d/%d) for inline\n",
+			__func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len);
+		return -1;
+	}
+
+	hdrlen = 28; /*sizeof *headerp;*/
+	padlen = 0;
+
+	/*
+	 * Pull up any extra send data into the preregistered buffer.
+	 * When padding is in use and applies to the transfer, insert
+	 * it and change the message type.
+	 */
+	if (rtype == rpcrdma_noch) {
+
+		padlen = rpcrdma_inline_pullup(rqst,
+						RPCRDMA_INLINE_PAD_VALUE(rqst));
+
+		if (padlen) {
+			headerp->rm_type = __constant_htonl(RDMA_MSGP);
+			headerp->rm_body.rm_padded.rm_align =
+				htonl(RPCRDMA_INLINE_PAD_VALUE(rqst));
+			headerp->rm_body.rm_padded.rm_thresh =
+				__constant_htonl(RPCRDMA_INLINE_PAD_THRESH);
+			headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
+			headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
+			headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
+			hdrlen += 2 * sizeof(u32);	/* extra words in padhdr */
+			BUG_ON(wtype != rpcrdma_noch);
+
+		} else {
+			headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+			headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+			headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+			/* new length after pullup */
+			rpclen = rqst->rq_svec[0].iov_len;
+			/*
+			 * Currently we try to not actually use read inline.
+			 * Reply chunks have the desirable property that
+			 * they land, packed, directly in the target buffers
+			 * without headers, so they require no fixup. The
+			 * additional RDMA Write op sends the same amount
+			 * of data, streams on-the-wire and adds no overhead
+			 * on receive. Therefore, we request a reply chunk
+			 * for non-writes wherever feasible and efficient.
+			 */
+			if (wtype == rpcrdma_noch &&
+			    r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER)
+				wtype = rpcrdma_replych;
+		}
+	}
+
+	/*
+	 * Marshal chunks. This routine will return the header length
+	 * consumed by marshaling.
+	 */
+	if (rtype != rpcrdma_noch) {
+		hdrlen = rpcrdma_create_chunks(rqst,
+					&rqst->rq_snd_buf, headerp, rtype);
+		wtype = rtype;	/* simplify dprintk */
+
+	} else if (wtype != rpcrdma_noch) {
+		hdrlen = rpcrdma_create_chunks(rqst,
+					&rqst->rq_rcv_buf, headerp, wtype);
+	}
+
+	if (hdrlen == 0)
+		return -1;
+
+	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd\n"
+		"                   headerp 0x%p base 0x%p lkey 0x%x\n",
+		__func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+		headerp, base, req->rl_iov.lkey);
+
+	/*
+	 * initialize send_iov's - normally only two: rdma chunk header and
+	 * single preregistered RPC header buffer, but if padding is present,
+	 * then use a preregistered (and zeroed) pad buffer between the RPC
+	 * header and any write data. In all non-rdma cases, any following
+	 * data has been copied into the RPC header buffer.
+	 */
+	req->rl_send_iov[0].addr = req->rl_iov.addr;
+	req->rl_send_iov[0].length = hdrlen;
+	req->rl_send_iov[0].lkey = req->rl_iov.lkey;
+
+	req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base);
+	req->rl_send_iov[1].length = rpclen;
+	req->rl_send_iov[1].lkey = req->rl_iov.lkey;
+
+	req->rl_niovs = 2;
+
+	if (padlen) {
+		struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+
+		req->rl_send_iov[2].addr = ep->rep_pad.addr;
+		req->rl_send_iov[2].length = padlen;
+		req->rl_send_iov[2].lkey = ep->rep_pad.lkey;
+
+		req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
+		req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
+		req->rl_send_iov[3].lkey = req->rl_iov.lkey;
+
+		req->rl_niovs = 4;
+	}
+
+	return 0;
+}
+
+/*
+ * Chase down a received write or reply chunklist to get length
+ * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
+ */
+static int
+rpcrdma_count_chunks(struct rpcrdma_rep *rep, int max, int wrchunk, u32 **iptrp)
+{
+	unsigned int i, total_len;
+	struct rpcrdma_write_chunk *cur_wchunk;
+
+	i = ntohl(**iptrp);	/* get array count */
+	if (i > max)
+		return -1;
+	cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
+	total_len = 0;
+	while (i--) {
+		struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
+		ifdebug(FACILITY) {
+			u64 off;
+			xdr_decode_hyper((u32 *)&seg->rs_offset, &off);
+			dprintk("RPC:       %s: chunk %d@0x%llx:0x%x\n",
+				__func__,
+				ntohl(seg->rs_length),
+				off,
+				ntohl(seg->rs_handle));
+		}
+		total_len += ntohl(seg->rs_length);
+		++cur_wchunk;
+	}
+	/* check and adjust for properly terminated write chunk */
+	if (wrchunk) {
+		u32 *w = (u32 *) cur_wchunk;
+		if (*w++ != xdr_zero)
+			return -1;
+		cur_wchunk = (struct rpcrdma_write_chunk *) w;
+	}
+	if ((char *) cur_wchunk > rep->rr_base + rep->rr_len)
+		return -1;
+
+	*iptrp = (u32 *) cur_wchunk;
+	return total_len;
+}
+
+/*
+ * Scatter inline received data back into provided iov's.
+ */
+static void
+rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
+{
+	int i, npages, curlen, olen;
+	char *destp;
+
+	curlen = rqst->rq_rcv_buf.head[0].iov_len;
+	if (curlen > copy_len) {	/* write chunk header fixup */
+		curlen = copy_len;
+		rqst->rq_rcv_buf.head[0].iov_len = curlen;
+	}
+
+	dprintk("RPC:       %s: srcp 0x%p len %d hdrlen %d\n",
+		__func__, srcp, copy_len, curlen);
+
+	/* Shift pointer for first receive segment only */
+	rqst->rq_rcv_buf.head[0].iov_base = srcp;
+	srcp += curlen;
+	copy_len -= curlen;
+
+	olen = copy_len;
+	i = 0;
+	rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
+	if (copy_len && rqst->rq_rcv_buf.page_len) {
+		npages = PAGE_ALIGN(rqst->rq_rcv_buf.page_base +
+			rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
+		for (; i < npages; i++) {
+			if (i == 0)
+				curlen = PAGE_SIZE - rqst->rq_rcv_buf.page_base;
+			else
+				curlen = PAGE_SIZE;
+			if (curlen > copy_len)
+				curlen = copy_len;
+			dprintk("RPC:       %s: page %d"
+				" srcp 0x%p len %d curlen %d\n",
+				__func__, i, srcp, copy_len, curlen);
+			destp = kmap_atomic(rqst->rq_rcv_buf.pages[i],
+						KM_SKB_SUNRPC_DATA);
+			if (i == 0)
+				memcpy(destp + rqst->rq_rcv_buf.page_base,
+						srcp, curlen);
+			else
+				memcpy(destp, srcp, curlen);
+			flush_dcache_page(rqst->rq_rcv_buf.pages[i]);
+			kunmap_atomic(destp, KM_SKB_SUNRPC_DATA);
+			srcp += curlen;
+			copy_len -= curlen;
+			if (copy_len == 0)
+				break;
+		}
+		rqst->rq_rcv_buf.page_len = olen - copy_len;
+	} else
+		rqst->rq_rcv_buf.page_len = 0;
+
+	if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
+		curlen = copy_len;
+		if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
+			curlen = rqst->rq_rcv_buf.tail[0].iov_len;
+		if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
+			memcpy(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
+		dprintk("RPC:       %s: tail srcp 0x%p len %d curlen %d\n",
+			__func__, srcp, copy_len, curlen);
+		rqst->rq_rcv_buf.tail[0].iov_len = curlen;
+		copy_len -= curlen; ++i;
+	} else
+		rqst->rq_rcv_buf.tail[0].iov_len = 0;
+
+	if (copy_len)
+		dprintk("RPC:       %s: %d bytes in"
+			" %d extra segments (%d lost)\n",
+			__func__, olen, i, copy_len);
+
+	/* TBD avoid a warning from call_decode() */
+	rqst->rq_private_buf = rqst->rq_rcv_buf;
+}
+
+/*
+ * This function is called when an async event is posted to
+ * the connection which changes the connection state. All it
+ * does at this point is mark the connection up/down, the rpc
+ * timers do the rest.
+ */
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
+{
+	struct rpc_xprt *xprt = ep->rep_xprt;
+
+	spin_lock_bh(&xprt->transport_lock);
+	if (ep->rep_connected > 0) {
+		if (!xprt_test_and_set_connected(xprt))
+			xprt_wake_pending_tasks(xprt, 0);
+	} else {
+		if (xprt_test_and_clear_connected(xprt))
+			xprt_wake_pending_tasks(xprt, ep->rep_connected);
+	}
+	spin_unlock_bh(&xprt->transport_lock);
+}
+
+/*
+ * This function is called when memory window unbind which we are waiting
+ * for completes. Just use rr_func (zeroed by upcall) to signal completion.
+ */
+static void
+rpcrdma_unbind_func(struct rpcrdma_rep *rep)
+{
+	wake_up(&rep->rr_unbind);
+}
+
+/*
+ * Called as a tasklet to do req/reply match and complete a request
+ * Errors must result in the RPC task either being awakened, or
+ * allowed to timeout, to discover the errors at that time.
+ */
+void
+rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+{
+	struct rpcrdma_msg *headerp;
+	struct rpcrdma_req *req;
+	struct rpc_rqst *rqst;
+	struct rpc_xprt *xprt = rep->rr_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	u32 *iptr;
+	int i, rdmalen, status;
+
+	/* Check status. If bad, signal disconnect and return rep to pool */
+	if (rep->rr_len == ~0U) {
+		rpcrdma_recv_buffer_put(rep);
+		if (r_xprt->rx_ep.rep_connected == 1) {
+			r_xprt->rx_ep.rep_connected = -EIO;
+			rpcrdma_conn_func(&r_xprt->rx_ep);
+		}
+		return;
+	}
+	if (rep->rr_len < 28) {
+		dprintk("RPC:       %s: short/invalid reply\n", __func__);
+		goto repost;
+	}
+	headerp = (struct rpcrdma_msg *) rep->rr_base;
+	if (headerp->rm_vers != xdr_one) {
+		dprintk("RPC:       %s: invalid version %d\n",
+			__func__, ntohl(headerp->rm_vers));
+		goto repost;
+	}
+
+	/* Get XID and try for a match. */
+	spin_lock(&xprt->transport_lock);
+	rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
+	if (rqst == NULL) {
+		spin_unlock(&xprt->transport_lock);
+		dprintk("RPC:       %s: reply 0x%p failed "
+			"to match any request xid 0x%08x len %d\n",
+			__func__, rep, headerp->rm_xid, rep->rr_len);
+repost:
+		r_xprt->rx_stats.bad_reply_count++;
+		rep->rr_func = rpcrdma_reply_handler;
+		if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+			rpcrdma_recv_buffer_put(rep);
+
+		return;
+	}
+
+	/* get request object */
+	req = rpcr_to_rdmar(rqst);
+
+	dprintk("RPC:       %s: reply 0x%p completes request 0x%p\n"
+		"                   RPC request 0x%p xid 0x%08x\n",
+		__func__, rep, req, rqst, headerp->rm_xid);
+
+	BUG_ON(!req || req->rl_reply);
+
+	/* from here on, the reply is no longer an orphan */
+	req->rl_reply = rep;
+
+	/* check for expected message types */
+	/* The order of some of these tests is important. */
+	switch (headerp->rm_type) {
+	case __constant_htonl(RDMA_MSG):
+		/* never expect read chunks */
+		/* never expect reply chunks (two ways to check) */
+		/* never expect write chunks without having offered RDMA */
+		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
+		    (headerp->rm_body.rm_chunks[1] == xdr_zero &&
+		     headerp->rm_body.rm_chunks[2] != xdr_zero) ||
+		    (headerp->rm_body.rm_chunks[1] != xdr_zero &&
+		     req->rl_nchunks == 0))
+			goto badheader;
+		if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
+			/* count any expected write chunks in read reply */
+			/* start at write chunk array count */
+			iptr = &headerp->rm_body.rm_chunks[2];
+			rdmalen = rpcrdma_count_chunks(rep,
+						req->rl_nchunks, 1, &iptr);
+			/* check for validity, and no reply chunk after */
+			if (rdmalen < 0 || *iptr++ != xdr_zero)
+				goto badheader;
+			rep->rr_len -=
+			    ((unsigned char *)iptr - (unsigned char *)headerp);
+			status = rep->rr_len + rdmalen;
+			r_xprt->rx_stats.total_rdma_reply += rdmalen;
+		} else {
+			/* else ordinary inline */
+			iptr = (u32 *)((unsigned char *)headerp + 28);
+			rep->rr_len -= 28; /*sizeof *headerp;*/
+			status = rep->rr_len;
+		}
+		/* Fix up the rpc results for upper layer */
+		rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len);
+		break;
+
+	case __constant_htonl(RDMA_NOMSG):
+		/* never expect read or write chunks, always reply chunks */
+		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
+		    headerp->rm_body.rm_chunks[1] != xdr_zero ||
+		    headerp->rm_body.rm_chunks[2] != xdr_one ||
+		    req->rl_nchunks == 0)
+			goto badheader;
+		iptr = (u32 *)((unsigned char *)headerp + 28);
+		rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
+		if (rdmalen < 0)
+			goto badheader;
+		r_xprt->rx_stats.total_rdma_reply += rdmalen;
+		/* Reply chunk buffer already is the reply vector - no fixup. */
+		status = rdmalen;
+		break;
+
+badheader:
+	default:
+		dprintk("%s: invalid rpcrdma reply header (type %d):"
+				" chunks[012] == %d %d %d"
+				" expected chunks <= %d\n",
+				__func__, ntohl(headerp->rm_type),
+				headerp->rm_body.rm_chunks[0],
+				headerp->rm_body.rm_chunks[1],
+				headerp->rm_body.rm_chunks[2],
+				req->rl_nchunks);
+		status = -EIO;
+		r_xprt->rx_stats.bad_reply_count++;
+		break;
+	}
+
+	/* If using mw bind, start the deregister process now. */
+	/* (Note: if mr_free(), cannot perform it here, in tasklet context) */
+	if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) {
+	case RPCRDMA_MEMWINDOWS:
+		for (i = 0; req->rl_nchunks-- > 1;)
+			i += rpcrdma_deregister_external(
+				&req->rl_segments[i], r_xprt, NULL);
+		/* Optionally wait (not here) for unbinds to complete */
+		rep->rr_func = rpcrdma_unbind_func;
+		(void) rpcrdma_deregister_external(&req->rl_segments[i],
+						   r_xprt, rep);
+		break;
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+		for (i = 0; req->rl_nchunks--;)
+			i += rpcrdma_deregister_external(&req->rl_segments[i],
+							 r_xprt, NULL);
+		break;
+	default:
+		break;
+	}
+
+	dprintk("RPC:       %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
+		__func__, xprt, rqst, status);
+	xprt_complete_rqst(rqst->rq_task, status);
+	spin_unlock(&xprt->transport_lock);
+}
/*
 *  linux/drivers/net/irda/sa1100_ir.c
 *
 *  Copyright (C) 2000-2001 Russell King
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 *  Infra-red driver for the StrongARM SA1100 embedded microprocessor
 *
 *  Note that we don't have to worry about the SA1111's DMA bugs in here,
 *  so we use the straightforward dma_map_* functions with a null pointer.
 *
 *  This driver takes one kernel command line parameter, sa1100ir=, with
 *  the following options:
 *	max_rate:baudrate	- set the maximum baud rate
 *	power_level:level	- set the transmitter power level
 *	tx_lpm:0|1		- set transmit low power mode
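 *
 *  For example, booting with "sa1100ir=max_rate:115200" caps the link
 *  at 115.2kbps (illustrative; each option uses the option:value form
 *  listed above).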
 */
#include <linux/config.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/rtnetlink.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/platform_device.h>
#include <linux/dma-mapping.h>

#include <net/irda/irda.h>
#include <net/irda/wrapper.h>
#include <net/irda/irda_device.h>

#include <asm/irq.h>
#include <asm/dma.h>
#include <asm/hardware.h>
#include <asm/mach/irda.h>

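/*
 * Tunable defaults; see the sa1100ir= options described in the header
 * comment above.
 */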
static int power_level = 3;
static int tx_lpm;
static int max_rate = 4000000;

struct sa1100_irda {
	unsigned char		hscr0;
	unsigned char		utcr4;
	unsigned char		power;
	unsigned char		open;

	int			speed;
	int			newspeed;

	struct sk_buff		*txskb;
	struct sk_buff		*rxskb;
	dma_addr_t		txbuf_dma;
	dma_addr_t		rxbuf_dma;
	dma_regs_t		*txdma;
	dma_regs_t		*rxdma;

	struct net_device_stats	stats;
	struct device		*dev;
	struct irda_platform_data *pdata;
	struct irlap_cb		*irlap;
	struct qos_info		qos;

	iobuff_t		tx_buff;
	iobuff_t		rx_buff;
};

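/* At 4Mbps we run the HSSP in FIR mode; lower speeds use the UART in HP-SIR mode. */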
#define IS_FIR(si)		((si)->speed >= 4000000)

#define HPSIR_MAX_RXLEN		2047

/*
 * Allocate and map the receive buffer, unless it is already allocated.
 */
static int sa1100_irda_rx_alloc(struct sa1100_irda *si)
{
	if (si->rxskb)
		return 0;

	si->rxskb = alloc_skb(HPSIR_MAX_RXLEN + 1, GFP_ATOMIC);

	if (!si->rxskb) {
		printk(KERN_ERR "sa1100_ir: out of memory for RX SKB\n");
		return -ENOMEM;
	}

	/*
	 * Align any IP headers that may be contained
	 * within the frame.
	 */
	skb_reserve(si->rxskb, 1);

	si->rxbuf_dma = dma_map_single(si->dev, si->rxskb->data,
					HPSIR_MAX_RXLEN,
					DMA_FROM_DEVICE);
	return 0;
}

/*
 * We want to get here as soon as possible, and get the receiver set up.
 * We use the existing buffer.
 */
static void sa1100_irda_rx_dma_start(struct sa1100_irda *si)
{
	if (!si->rxskb) {
		printk(KERN_ERR "sa1100_ir: rx buffer went missing\n");
		return;
	}

	/*
	 * First empty receive FIFO
	 */
	Ser2HSCR0 = si->hscr0 | HSCR0_HSSP;

	/*
	 * Enable the DMA, receiver and receive interrupt.
	 */
	sa1100_clear_dma(si->rxdma);
	sa1100_start_dma(si->rxdma, si->rxbuf_dma, HPSIR_MAX_RXLEN);
	Ser2HSCR0 = si->hscr0 | HSCR0_HSSP | HSCR0_RXE;
}

/*
 * Set the IrDA communications speed.
 */
static int sa1100_irda_set_speed(struct sa1100_irda *si, int speed)
{
	unsigned long flags;
	int brd, ret = -EINVAL;

	switch (speed) {
	case 9600:	case 19200:	case 38400:
	case 57600:	case 115200:
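		/*
		 * Divisor for the 3.6864MHz UART clock with 16x
		 * oversampling; e.g. 115200 baud gives
		 * 3686400 / (16 * 115200) - 1 = 1.
		 */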
		brd = 3686400 / (16 * speed) - 1;

		/*
		 * Stop the receive DMA.
		 */
		if (IS_FIR(si))
			sa1100_stop_dma(si->rxdma);

		local_irq_save(flags);

		Ser2UTCR3 = 0;
		Ser2HSCR0 = HSCR0_UART;

		Ser2UTCR1 = brd >> 8;
		Ser2UTCR2 = brd;

		/*
		 * Clear status register
		 */
		Ser2UTSR0 = UTSR0_REB | UTSR0_RBB | UTSR0_RID;
		Ser2UTCR3 = UTCR3_RIE | UTCR3_RXE | UTCR3_TXE;

		if (si->pdata->set_speed)
			si->pdata->set_speed(si->dev, speed);

		si->speed = speed;

		local_irq_restore(flags);
		ret = 0;
		break;

	case 4000000:
		local_irq_save(flags);

		si->hscr0 = 0;

		Ser2HSSR0 = 0xff;
		Ser2HSCR0 = si->hscr0 | HSCR0_HSSP;
		Ser2UTCR3 = 0;

		si->speed = speed;

		if (si->pdata->set_speed)
			si->pdata->set_speed(si->dev, speed);

		sa1100_irda_rx_alloc(si);
		sa1100_irda_rx_dma_start(si);

		local_irq_restore(flags);

		break;

	default:
		break;
	}

	return ret;
}

/*
 * Control the power state of the IrDA transmitter.
 * State:
 *  0 - off
 *  1 - short range, lowest power
 *  2 - medium range, medium power
 *  3 - maximum range, high power
 *
 * Currently, only assabet is known to support this.
 */
static int
__sa1100_irda_set_power(struct sa1100_irda *si, unsigned int state)
{
	int ret = 0;
	if (si->pdata->set_power)
		ret = si->pdata->set_power(si->dev, state);
	return ret;
}

static inline int
sa1100_set_power(struct sa1100_irda *si, unsigned int state)
{
	int ret;

	ret = __sa1100_irda_set_power(si, state);
	if (ret == 0)
		si->power = state;

	return ret;
}

static int sa1100_irda_startup(struct sa1100_irda *si)
{
	int ret;

	/*
	 * Ensure that the ports for this device are setup correctly.
	 */
	if (si->pdata->startup)
		si->pdata->startup(si->dev);

	/*
	 * Configure PPC for IRDA - we want to drive TXD2 low.
	 * We also want to drive this pin low during sleep.
	 */
	PPSR &= ~PPC_TXD2;
	PSDR &= ~PPC_TXD2;
	PPDR |= PPC_TXD2;

	/*
	 * Enable HP-SIR modulation, and ensure that the port is disabled.
	 */
	Ser2UTCR3 = 0;
	Ser2HSCR0 = HSCR0_UART;
	Ser2UTCR4 = si->utcr4;
	Ser2UTCR0 = UTCR0_8BitData;
	Ser2HSCR2 = HSCR2_TrDataH | HSCR2_RcDataL;

	/*
	 * Clear status register
	 */
	Ser2UTSR0 = UTSR0_REB | UTSR0_RBB | UTSR0_RID;

	ret = sa1100_irda_set_speed(si, si->speed = 9600);
	if (ret) {
		Ser2UTCR3 = 0;
		Ser2HSCR0 = 0;

		if (si->pdata->shutdown)
			si->pdata->shutdown(si->dev);
	}

	return ret;
}

static void sa1100_irda_shutdown(struct sa1100_irda *si)
{
	/*
	 * Stop all DMA activity.
	 */
	sa1100_stop_dma(si->rxdma);
	sa1100_stop_dma(si->txdma);

	/* Disable the port. */
	Ser2UTCR3 = 0;
	Ser2HSCR0 = 0;

	if (si->pdata->shutdown)
		si->pdata->shutdown(si->dev);
}

#ifdef CONFIG_PM
/*
 * Suspend the IrDA interface.
 */
static int sa1100_irda_suspend(struct platform_device *pdev, pm_message_t state)
{
	struct net_device *dev = platform_get_drvdata(pdev);
	struct sa1100_irda *si;

	if (!dev)
		return 0;

	si = dev->priv;
	if (si->open) {
		/*
		 * Stop the transmit queue
		 */
		netif_device_detach(dev);
		disable_irq(dev->irq);
		sa1100_irda_shutdown(si);
		__sa1100_irda_set_power(si, 0);
	}

	return 0;
}

/*
 * Resume the IrDA interface.
 */
static int sa1100_irda_resume(struct platform_device *pdev)
{
	struct net_device *dev = platform_get_drvdata(pdev);
	struct sa1100_irda *si;

	if (!dev)
		return 0;

	si = dev->priv;
	if (si->open) {
		/*
		 * If we missed a speed change, initialise at the new speed
		 * directly.  It is debatable whether this is actually
		 * required, but in the interests of continuing from where
		 * we left off it is desirable.  The converse argument is
		 * that we should re-negotiate at 9600 baud again.
		 */
		if (si->newspeed) {
			si->speed = si->newspeed;
			si->newspeed = 0;
		}

		sa1100_irda_startup(si);
		__sa1100_irda_set_power(si, si->power);
		enable_irq(dev->irq);

		/*
		 * This automatically wakes up the queue
		 */
		netif_device_attach(dev);
	}

	return 0;
}
#else
#define sa1100_irda_suspend	NULL
#define sa1100_irda_resume	NULL
#endif

/*
 * HP-SIR format interrupt service routines.
 */
static void sa1100_irda_hpsir_irq(struct net_device *dev)
{
	struct sa1100_irda *si = dev->priv;
	int status;

	status = Ser2UTSR0;

	/*
	 * Deal with any receive errors first.  The bytes in error may be
	 * the only bytes in the receive FIFO, so we do this first.
	 */
	while (status & UTSR0_EIF) {
		int stat, data;

		stat = Ser2UTSR1;
		data = Ser2UTDR;

		if (stat & (UTSR1_FRE | UTSR1_ROR)) {
			si->stats.rx_errors++;
			if (stat & UTSR1_FRE)
				si->stats.rx_frame_errors++;
			if (stat & UTSR1_ROR)
				si->stats.rx_fifo_errors++;
		} else
			async_unwrap_char(dev, &si->stats, &si->rx_buff, data);

		status = Ser2UTSR0;
	}

	/*
	 * We must clear certain bits.
	 */
	Ser2UTSR0 = status & (UTSR0_RID | UTSR0_RBB | UTSR0_REB);

	if (status & UTSR0_RFS) {
		/*
		 * There are at least 4 bytes in the FIFO.  Read 3 bytes
		 * and leave the rest to the block below.
		 */
		async_unwrap_char(dev, &si->stats, &si->rx_buff, Ser2UTDR);
		async_unwrap_char(dev, &si->stats, &si->rx_buff, Ser2UTDR);
		async_unwrap_char(dev, &si->stats, &si->rx_buff, Ser2UTDR);
	}

	if (status & (UTSR0_RFS | UTSR0_RID)) {
		/*
		 * Fifo contains more than 1 character.
		 */
		do {
			async_unwrap_char(dev, &si->stats, &si->rx_buff,
					  Ser2UTDR);
		} while (Ser2UTSR1 & UTSR1_RNE);

		dev->last_rx = jiffies;
	}

	if (status & UTSR0_TFS && si->tx_buff.len) {
		/*
		 * Transmitter FIFO is not full
		 */
		do {
			Ser2UTDR = *si->tx_buff.data++;
			si->tx_buff.len -= 1;
		} while (Ser2UTSR1 & UTSR1_TNF && si->tx_buff.len);

		if (si->tx_buff.len == 0) {
			si->stats.tx_packets++;
			si->stats.tx_bytes += si->tx_buff.data -
					      si->tx_buff.head;

			/*
			 * We need to ensure that the transmitter has
			 * finished.
			 */
			do
				rmb();
			while (Ser2UTSR1 & UTSR1_TBY);

			/*
			 * Ok, we've finished transmitting.  Now enable
			 * the receiver.  Sometimes we get a receive IRQ
			 * immediately after a transmit...
			 */
			Ser2UTSR0 = UTSR0_REB | UTSR0_RBB | UTSR0_RID;
			Ser2UTCR3 = UTCR3_RIE | UTCR3_RXE | UTCR3_TXE;

			if (si->newspeed) {
				sa1100_irda_set_speed(si, si->newspeed);
				si->newspeed = 0;
			}

			/* I'm hungry! */
			netif_wake_queue(dev);
		}
	}
}

static void sa1100_irda_fir_error(struct sa1100_irda *si, struct net_device *dev)
{
	struct sk_buff *skb = si->rxskb;
	dma_addr_t dma_addr;
	unsigned int len, stat, data;

	if (!skb) {
		printk(KERN_ERR "sa1100_ir: SKB is NULL!\n");
		return;
	}

	/*
	 * Get the current data position.
	 */
	dma_addr = sa1100_get_dma_pos(si->rxdma);
	len = dma_addr - si->rxbuf_dma;
	if (len > HPSIR_MAX_RXLEN)
		len = HPSIR_MAX_RXLEN;
	dma_unmap_single(si->dev, si->rxbuf_dma, len, DMA_FROM_DEVICE);

	do {
		/*
		 * Read Status, and then Data.
		 */
		stat = Ser2HSSR1;
		rmb();
		data = Ser2HSDR;

		if (stat & (HSSR1_CRE | HSSR1_ROR)) {
			si->stats.rx_errors++;
			if (stat & HSSR1_CRE)
				si->stats.rx_crc_errors++;
			if (stat & HSSR1_ROR)
				si->stats.rx_frame_errors++;
		} else
			skb->data[len++] = data;

		/*
		 * If we hit the end of frame, there's
		 * no point in continuing.
		 */
		if (stat & HSSR1_EOF)
			break;
	} while (Ser2HSSR0 & HSSR0_EIF);

	if (stat & HSSR1_EOF) {
		si->rxskb = NULL;

		skb_put(skb, len);
		skb->dev = dev;
		skb->mac.raw = skb->data;
		skb->protocol = htons(ETH_P_IRDA);
		si->stats.rx_packets++;
		si->stats.rx_bytes += len;

		/*
		 * Before we pass the buffer up, allocate a new one.
		 */
		sa1100_irda_rx_alloc(si);

		netif_rx(skb);
		dev->last_rx = jiffies;
	} else {
		/*
		 * Remap the buffer.
		 */
		si->rxbuf_dma = dma_map_single(si->dev, si->rxskb->data,
						HPSIR_MAX_RXLEN,
						DMA_FROM_DEVICE);
	}
}

/*
 * FIR format interrupt service routine.  We only have to
 * handle RX events; transmit events go via the TX DMA handler.
 *
 * No matter what, we disable RX, process, and then restart RX.
 */
static void sa1100_irda_fir_irq(struct net_device *dev)
{
	struct sa1100_irda *si = dev->priv;

	/*
	 * Stop RX DMA
	 */
	sa1100_stop_dma(si->rxdma);

	/*
	 * Framing error - we throw away the packet completely.
	 * Clearing RXE flushes the error conditions and data
	 * from the fifo.
	 */
	if (Ser2HSSR0 & (HSSR0_FRE | HSSR0_RAB)) {
		si->stats.rx_errors++;

		if (Ser2HSSR0 & HSSR0_FRE)
			si->stats.rx_frame_errors++;

		/*
		 * Clear out the DMA...
		 */
		Ser2HSCR0 = si->hscr0 | HSCR0_HSSP;

		/*
		 * Clear selected status bits now, so we
		 * don't miss them next time around.
		 */
		Ser2HSSR0 = HSSR0_FRE | HSSR0_RAB;
	}

	/*
	 * Deal with any receive errors.  Any of the lowest
	 * 8 bytes in the FIFO may contain an error.  We must read
	 * them one by one.  The "error" could even be the end of
	 * packet!
	 */
	if (Ser2HSSR0 & HSSR0_EIF)
		sa1100_irda_fir_error(si, dev);

	/*
	 * No matter what happens, we must restart reception.
	 */
	sa1100_irda_rx_dma_start(si);
}

static irqreturn_t sa1100_irda_irq(int irq, void *dev_id, struct pt_regs *regs)
{
	struct net_device *dev = dev_id;
	if (IS_FIR(((struct sa1100_irda *)dev->priv)))
		sa1100_irda_fir_irq(dev);
	else
		sa1100_irda_hpsir_irq(dev);
	return IRQ_HANDLED;
}

/*
 * TX DMA completion handler.
 */
static void sa1100_irda_txdma_irq(void *id)
{
	struct net_device *dev = id;
	struct sa1100_irda *si = dev->priv;
	struct sk_buff *skb = si->txskb;

	si->txskb = NULL;

	/*
	 * Wait for the transmission to complete.  Unfortunately,
	 * the hardware doesn't give us an interrupt to indicate
	 * "end of frame".
	 */
	do
		rmb();
	while (!(Ser2HSSR0 & HSSR0_TUR) || Ser2HSSR1 & HSSR1_TBY);

	/*
	 * Clear the transmit underrun bit.
	 */
	Ser2HSSR0 = HSSR0_TUR;

	/*
	 * Do we need to change speed?  Note that we're lazy
	 * here - we don't free the old rxskb.  We don't need
	 * to allocate a buffer either.
	 */
	if (si->newspeed) {
		sa1100_irda_set_speed(si, si->newspeed);
		si->newspeed = 0;
	}

	/*
	 * Start reception.  This disables the transmitter for
	 * us.  This will be using the existing RX buffer.
	 */
	sa1100_irda_rx_dma_start(si);

	/*
	 * Account and free the packet.
	 */
	if (skb) {
		dma_unmap_single(si->dev, si->txbuf_dma, skb->len, DMA_TO_DEVICE);
		si->stats.tx_packets++;
		si->stats.tx_bytes += skb->len;
		dev_kfree_skb_irq(skb);
	}

	/*
	 * Make sure that the TX queue is available for sending
	 * (for retries).  TX has priority over RX at all times.
	 */
	netif_wake_queue(dev);
}

static int sa1100_irda_hard_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct sa1100_irda *si = dev->priv;
	int speed = irda_get_next_speed(skb);

	/*
	 * Does this packet contain a request to change the interface
	 * speed?  If so, remember it until we complete the transmission
	 * of this frame.
	 */
	if (speed != si->speed && speed != -1)
		si->newspeed = speed;

	/*
	 * If this is an empty frame, we can bypass a lot.
	 */
	if (skb->len == 0) {
		if (si->newspeed) {
			si->newspeed = 0;
			sa1100_irda_set_speed(si, speed);
		}
		dev_kfree_skb(skb);
		return 0;
	}

	if (!IS_FIR(si)) {
		netif_stop_queue(dev);

		si->tx_buff.data = si->tx_buff.head;
		si->tx_buff.len  = async_wrap_skb(skb, si->tx_buff.data,
						  si->tx_buff.truesize);

		/*
		 * Set the transmit interrupt enable.  This will fire
		 * off an interrupt immediately.  Note that we disable
		 * the receiver so we won't get spurious characters
		 * received.
		 */
		Ser2UTCR3 = UTCR3_TIE | UTCR3_TXE;

		dev_kfree_skb(skb);
	} else {
		int mtt = irda_get_mtt(skb);

		/*
		 * We must not be transmitting...
		 */
		BUG_ON(si->txskb);

		netif_stop_queue(dev);

		si->txskb = skb;
		si->txbuf_dma = dma_map_single(si->dev, skb->data,
					 skb->len, DMA_TO_DEVICE);

		sa1100_start_dma(si->txdma, si->txbuf_dma, skb->len);

		/*
		 * If we have a mean turn-around time, impose the
		 * specified delay.  We could shorten this by timing from
		 * the point we received the packet.
		 */
		if (mtt)
			udelay(mtt);

		Ser2HSCR0 = si->hscr0 | HSCR0_HSSP | HSCR0_TXE;
	}

	dev->trans_start = jiffies;

	return 0;
}

static int
sa1100_irda_ioctl(struct net_device *dev, struct ifreq *ifreq, int cmd)
{
	struct if_irda_req *rq = (struct if_irda_req *)ifreq;
	struct sa1100_irda *si = dev->priv;
	int ret = -EOPNOTSUPP;

	switch (cmd) {
	case SIOCSBANDWIDTH:
		if (capable(CAP_NET_ADMIN)) {
			/*
			 * We are unable to set the speed if the
			 * device is not running.
			 */
			if (si->open) {
				ret = sa1100_irda_set_speed(si,
						rq->ifr_baudrate);
			} else {
				printk(KERN_DEBUG "sa1100_irda_ioctl: SIOCSBANDWIDTH: !netif_running\n");
				ret = 0;
			}
		}
		break;

	case SIOCSMEDIABUSY:
		ret = -EPERM;
		if (capable(CAP_NET_ADMIN)) {
			irda_device_set_media_busy(dev, TRUE);
			ret = 0;
		}
		break;

	case SIOCGRECEIVING:
		rq->ifr_receiving = IS_FIR(si) ? 0
					: si->rx_buff.state != OUTSIDE_FRAME;
		break;

	default:
		break;
	}

	return ret;
}

static struct net_device_stats *sa1100_irda_stats(struct net_device *dev)
{
	struct sa1100_irda *si = dev->priv;
	return &si->stats;
}

static int sa1100_irda_start(struct net_device *dev)
{
	struct sa1100_irda *si = dev->priv;
	int err;

	si->speed = 9600;

	err = request_irq(dev->irq, sa1100_irda_irq, 0, dev->name, dev);
	if (err)
		goto err_irq;

	err = sa1100_request_dma(DMA_Ser2HSSPRd, "IrDA receive",
				 NULL, NULL, &si->rxdma);
	if (err)
		goto err_rx_dma;

	err = sa1100_request_dma(DMA_Ser2HSSPWr, "IrDA transmit",
				 sa1100_irda_txdma_irq, dev, &si->txdma);
	if (err)
		goto err_tx_dma;

	/*
	 * The interrupt must remain disabled for now.
	 */
	disable_irq(dev->irq);

	/*
	 * Setup the serial port for the specified speed.
	 */
	err = sa1100_irda_startup(si);
	if (err)
		goto err_startup;

	/*
	 * Open a new IrLAP layer instance.
	 */
	si->irlap = irlap_open(dev, &si->qos, "sa1100");
	err = -ENOMEM;
	if (!si->irlap)
		goto err_irlap;

	/*
	 * Now enable the interrupt and start the queue
	 */
	si->open = 1;
	sa1100_set_power(si, power_level); /* low power mode */
	enable_irq(dev->irq);
	netif_start_queue(dev);
	return 0;

err_irlap:
	si->open = 0;
	sa1100_irda_shutdown(si);
err_startup:
	sa1100_free_dma(si->txdma);
err_tx_dma:
	sa1100_free_dma(si->rxdma);
err_rx_dma:
	free_irq(dev->irq, dev);
err_irq:
	return err;
}

static int sa1100_irda_stop(struct net_device *dev)
{
	struct sa1100_irda *si = dev->priv;

	disable_irq(dev->irq);
	sa1100_irda_shutdown(si);

	/*
	 * If we have been doing DMA receive, make sure we
	 * tidy that up cleanly.
	 */
	if (si->rxskb) {
		dma_unmap_single(si->dev, si->rxbuf_dma, HPSIR_MAX_RXLEN,
				 DMA_FROM_DEVICE);
		dev_kfree_skb(si->rxskb);
		si->rxskb = NULL;
	}

	/* Stop IrLAP */
	if (si->irlap) {
		irlap_close(si->irlap);
		si->irlap = NULL;
	}

	netif_stop_queue(dev);
	si->open = 0;

	/*
	 * Free resources
	 */
	sa1100_free_dma(si->txdma);
	sa1100_free_dma(si->rxdma);
	free_irq(dev->irq, dev);

	sa1100_set_power(si, 0);

	return 0;
}

static int sa1100_irda_init_iobuf(iobuff_t *io, int size)
{
	io->head = kmalloc(size, GFP_KERNEL | GFP_DMA);
	if (io->head != NULL) {
		io->truesize = size;
		io->in_frame = FALSE;
		io->state    = OUTSIDE_FRAME;
		io->data     = io->head;
	}
	return io->head ? 0 : -ENOMEM;
}

static int sa1100_irda_probe(struct platform_device *pdev)
{
	struct net_device *dev;
	struct sa1100_irda *si;
	unsigned int baudrate_mask;
	int err;

	if (!pdev->dev.platform_data)
		return -EINVAL;

	err = request_mem_region(__PREG(Ser2UTCR0), 0x24, "IrDA") ? 0 : -EBUSY;
	if (err)
		goto err_mem_1;
	err = request_mem_region(__PREG(Ser2HSCR0), 0x1c, "IrDA") ? 0 : -EBUSY;
	if (err)
		goto err_mem_2;
	err = request_mem_region(__PREG(Ser2HSCR2), 0x04, "IrDA") ? 0 : -EBUSY;
	if (err)
		goto err_mem_3;

	dev = alloc_irdadev(sizeof(struct sa1100_irda));
	if (!dev)
		goto err_mem_4;

	si = dev->priv;
	si->dev = &pdev->dev;
	si->pdata = pdev->dev.platform_data;

	/*
	 * Initialise the HP-SIR buffers
	 */
	err = sa1100_irda_init_iobuf(&si->rx_buff, 14384);
	if (err)
		goto err_mem_5;
	err = sa1100_irda_init_iobuf(&si->tx_buff, 4000);
	if (err)
		goto err_mem_5;

	dev->hard_start_xmit	= sa1100_irda_hard_xmit;
	dev->open		= sa1100_irda_start;
	dev->stop		= sa1100_irda_stop;
	dev->do_ioctl		= sa1100_irda_ioctl;
	dev->get_stats		= sa1100_irda_stats;
	dev->irq		= IRQ_Ser2ICP;

	irda_init_max_qos_capabilies(&si->qos);

	/*
	 * We support SIR speeds up to 115k2, plus 4Mbps FIR when the
	 * max_rate parameter allows it.  Min Turn Time is 1ms or greater.
	 */
	baudrate_mask = IR_9600;

	/* Deliberate fall-through: each rate also enables all slower rates. */
	switch (max_rate) {
	case 4000000:		baudrate_mask |= IR_4000000 << 8;
	case 115200:		baudrate_mask |= IR_115200;
	case 57600:		baudrate_mask |= IR_57600;
	case 38400:		baudrate_mask |= IR_38400;
	case 19200:		baudrate_mask |= IR_19200;
	}
		
	si->qos.baud_rate.bits &= baudrate_mask;
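	/*
	 * 0x07 advertises minimum turnaround times of 10ms, 5ms and 1ms
	 * (IrLAP QoS encoding: bit 0 = 10ms, bit 1 = 5ms, bit 2 = 1ms),
	 * matching the "1ms or greater" note above.
	 */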
	si->qos.min_turn_time.bits = 7;

	irda_qos_bits_to_value(&si->qos);

	si->utcr4 = UTCR4_HPSIR;
	if (tx_lpm)
		si->utcr4 |= UTCR4_Z1_6us;

	/*
	 * Initially enable HP-SIR modulation, and ensure that the port
	 * is disabled.
	 */
	Ser2UTCR3 = 0;
	Ser2UTCR4 = si->utcr4;
	Ser2HSCR0 = HSCR0_UART;

	err = register_netdev(dev);
	if (err == 0)
		platform_set_drvdata(pdev, dev);

	if (err) {
 err_mem_5:
		kfree(si->tx_buff.head);
		kfree(si->rx_buff.head);
		free_netdev(dev);
 err_mem_4:
		release_mem_region(__PREG(Ser2HSCR2), 0x04);
 err_mem_3:
		release_mem_region(__PREG(Ser2HSCR0), 0x1c);
 err_mem_2:
		release_mem_region(__PREG(Ser2UTCR0), 0x24);
	}
 err_mem_1:
	return err;
}

static int sa1100_irda_remove(struct platform_device *pdev)
{
	struct net_device *dev = platform_get_drvdata(pdev);