diff options
| author | David S. Miller <davem@davemloft.net> | 2010-09-09 17:58:11 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2010-09-09 17:58:11 -0400 |
| commit | cf0ac2b8a759fecbefd80f890c6dbe80ba65fd95 (patch) | |
| tree | d5feaeef197dc681d1cdab2e8070ac31f0c43141 | |
| parent | f27e21a813e2c4ca74b30a5443602e75b146db9b (diff) | |
| parent | 905d64c89e2a9d71d0606904b7c3908633db6072 (diff) | |
Merge branch 'for-davem' of git://oss.oracle.com/git/agrover/linux-2.6
42 files changed, 2614 insertions, 1613 deletions
diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 626b629429ff..c7fbf298ad68 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild | |||
| @@ -302,6 +302,7 @@ header-y += quota.h | |||
| 302 | header-y += radeonfb.h | 302 | header-y += radeonfb.h |
| 303 | header-y += random.h | 303 | header-y += random.h |
| 304 | header-y += raw.h | 304 | header-y += raw.h |
| 305 | header-y += rds.h | ||
| 305 | header-y += reboot.h | 306 | header-y += reboot.h |
| 306 | header-y += reiserfs_fs.h | 307 | header-y += reiserfs_fs.h |
| 307 | header-y += reiserfs_xattr.h | 308 | header-y += reiserfs_xattr.h |
diff --git a/include/linux/rds.h b/include/linux/rds.h index 7f3971d9fc5c..91950950aa59 100644 --- a/include/linux/rds.h +++ b/include/linux/rds.h | |||
| @@ -73,6 +73,10 @@ | |||
| 73 | #define RDS_CMSG_RDMA_MAP 3 | 73 | #define RDS_CMSG_RDMA_MAP 3 |
| 74 | #define RDS_CMSG_RDMA_STATUS 4 | 74 | #define RDS_CMSG_RDMA_STATUS 4 |
| 75 | #define RDS_CMSG_CONG_UPDATE 5 | 75 | #define RDS_CMSG_CONG_UPDATE 5 |
| 76 | #define RDS_CMSG_ATOMIC_FADD 6 | ||
| 77 | #define RDS_CMSG_ATOMIC_CSWP 7 | ||
| 78 | #define RDS_CMSG_MASKED_ATOMIC_FADD 8 | ||
| 79 | #define RDS_CMSG_MASKED_ATOMIC_CSWP 9 | ||
| 76 | 80 | ||
| 77 | #define RDS_INFO_FIRST 10000 | 81 | #define RDS_INFO_FIRST 10000 |
| 78 | #define RDS_INFO_COUNTERS 10000 | 82 | #define RDS_INFO_COUNTERS 10000 |
| @@ -89,9 +93,9 @@ | |||
| 89 | #define RDS_INFO_LAST 10010 | 93 | #define RDS_INFO_LAST 10010 |
| 90 | 94 | ||
| 91 | struct rds_info_counter { | 95 | struct rds_info_counter { |
| 92 | u_int8_t name[32]; | 96 | uint8_t name[32]; |
| 93 | u_int64_t value; | 97 | uint64_t value; |
| 94 | } __packed; | 98 | } __attribute__((packed)); |
| 95 | 99 | ||
| 96 | #define RDS_INFO_CONNECTION_FLAG_SENDING 0x01 | 100 | #define RDS_INFO_CONNECTION_FLAG_SENDING 0x01 |
| 97 | #define RDS_INFO_CONNECTION_FLAG_CONNECTING 0x02 | 101 | #define RDS_INFO_CONNECTION_FLAG_CONNECTING 0x02 |
| @@ -100,56 +104,48 @@ struct rds_info_counter { | |||
| 100 | #define TRANSNAMSIZ 16 | 104 | #define TRANSNAMSIZ 16 |
| 101 | 105 | ||
| 102 | struct rds_info_connection { | 106 | struct rds_info_connection { |
| 103 | u_int64_t next_tx_seq; | 107 | uint64_t next_tx_seq; |
| 104 | u_int64_t next_rx_seq; | 108 | uint64_t next_rx_seq; |
| 105 | __be32 laddr; | 109 | __be32 laddr; |
| 106 | __be32 faddr; | 110 | __be32 faddr; |
| 107 | u_int8_t transport[TRANSNAMSIZ]; /* null term ascii */ | 111 | uint8_t transport[TRANSNAMSIZ]; /* null term ascii */ |
| 108 | u_int8_t flags; | 112 | uint8_t flags; |
| 109 | } __packed; | 113 | } __attribute__((packed)); |
| 110 | |||
| 111 | struct rds_info_flow { | ||
| 112 | __be32 laddr; | ||
| 113 | __be32 faddr; | ||
| 114 | u_int32_t bytes; | ||
| 115 | __be16 lport; | ||
| 116 | __be16 fport; | ||
| 117 | } __packed; | ||
| 118 | 114 | ||
| 119 | #define RDS_INFO_MESSAGE_FLAG_ACK 0x01 | 115 | #define RDS_INFO_MESSAGE_FLAG_ACK 0x01 |
| 120 | #define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02 | 116 | #define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02 |
| 121 | 117 | ||
| 122 | struct rds_info_message { | 118 | struct rds_info_message { |
| 123 | u_int64_t seq; | 119 | uint64_t seq; |
| 124 | u_int32_t len; | 120 | uint32_t len; |
| 125 | __be32 laddr; | 121 | __be32 laddr; |
| 126 | __be32 faddr; | 122 | __be32 faddr; |
| 127 | __be16 lport; | 123 | __be16 lport; |
| 128 | __be16 fport; | 124 | __be16 fport; |
| 129 | u_int8_t flags; | 125 | uint8_t flags; |
| 130 | } __packed; | 126 | } __attribute__((packed)); |
| 131 | 127 | ||
| 132 | struct rds_info_socket { | 128 | struct rds_info_socket { |
| 133 | u_int32_t sndbuf; | 129 | uint32_t sndbuf; |
| 134 | __be32 bound_addr; | 130 | __be32 bound_addr; |
| 135 | __be32 connected_addr; | 131 | __be32 connected_addr; |
| 136 | __be16 bound_port; | 132 | __be16 bound_port; |
| 137 | __be16 connected_port; | 133 | __be16 connected_port; |
| 138 | u_int32_t rcvbuf; | 134 | uint32_t rcvbuf; |
| 139 | u_int64_t inum; | 135 | uint64_t inum; |
| 140 | } __packed; | 136 | } __attribute__((packed)); |
| 141 | 137 | ||
| 142 | struct rds_info_tcp_socket { | 138 | struct rds_info_tcp_socket { |
| 143 | __be32 local_addr; | 139 | __be32 local_addr; |
| 144 | __be16 local_port; | 140 | __be16 local_port; |
| 145 | __be32 peer_addr; | 141 | __be32 peer_addr; |
| 146 | __be16 peer_port; | 142 | __be16 peer_port; |
| 147 | u_int64_t hdr_rem; | 143 | uint64_t hdr_rem; |
| 148 | u_int64_t data_rem; | 144 | uint64_t data_rem; |
| 149 | u_int32_t last_sent_nxt; | 145 | uint32_t last_sent_nxt; |
| 150 | u_int32_t last_expected_una; | 146 | uint32_t last_expected_una; |
| 151 | u_int32_t last_seen_una; | 147 | uint32_t last_seen_una; |
| 152 | } __packed; | 148 | } __attribute__((packed)); |
| 153 | 149 | ||
| 154 | #define RDS_IB_GID_LEN 16 | 150 | #define RDS_IB_GID_LEN 16 |
| 155 | struct rds_info_rdma_connection { | 151 | struct rds_info_rdma_connection { |
| @@ -203,42 +199,69 @@ struct rds_info_rdma_connection { | |||
| 203 | * (so that the application does not have to worry about | 199 | * (so that the application does not have to worry about |
| 204 | * alignment). | 200 | * alignment). |
| 205 | */ | 201 | */ |
| 206 | typedef u_int64_t rds_rdma_cookie_t; | 202 | typedef uint64_t rds_rdma_cookie_t; |
| 207 | 203 | ||
| 208 | struct rds_iovec { | 204 | struct rds_iovec { |
| 209 | u_int64_t addr; | 205 | uint64_t addr; |
| 210 | u_int64_t bytes; | 206 | uint64_t bytes; |
| 211 | }; | 207 | }; |
| 212 | 208 | ||
| 213 | struct rds_get_mr_args { | 209 | struct rds_get_mr_args { |
| 214 | struct rds_iovec vec; | 210 | struct rds_iovec vec; |
| 215 | u_int64_t cookie_addr; | 211 | uint64_t cookie_addr; |
| 216 | uint64_t flags; | 212 | uint64_t flags; |
| 217 | }; | 213 | }; |
| 218 | 214 | ||
| 219 | struct rds_get_mr_for_dest_args { | 215 | struct rds_get_mr_for_dest_args { |
| 220 | struct sockaddr_storage dest_addr; | 216 | struct sockaddr_storage dest_addr; |
| 221 | struct rds_iovec vec; | 217 | struct rds_iovec vec; |
| 222 | u_int64_t cookie_addr; | 218 | uint64_t cookie_addr; |
| 223 | uint64_t flags; | 219 | uint64_t flags; |
| 224 | }; | 220 | }; |
| 225 | 221 | ||
| 226 | struct rds_free_mr_args { | 222 | struct rds_free_mr_args { |
| 227 | rds_rdma_cookie_t cookie; | 223 | rds_rdma_cookie_t cookie; |
| 228 | u_int64_t flags; | 224 | uint64_t flags; |
| 229 | }; | 225 | }; |
| 230 | 226 | ||
| 231 | struct rds_rdma_args { | 227 | struct rds_rdma_args { |
| 232 | rds_rdma_cookie_t cookie; | 228 | rds_rdma_cookie_t cookie; |
| 233 | struct rds_iovec remote_vec; | 229 | struct rds_iovec remote_vec; |
| 234 | u_int64_t local_vec_addr; | 230 | uint64_t local_vec_addr; |
| 235 | u_int64_t nr_local; | 231 | uint64_t nr_local; |
| 236 | u_int64_t flags; | 232 | uint64_t flags; |
| 237 | u_int64_t user_token; | 233 | uint64_t user_token; |
| 234 | }; | ||
| 235 | |||
| 236 | struct rds_atomic_args { | ||
| 237 | rds_rdma_cookie_t cookie; | ||
| 238 | uint64_t local_addr; | ||
| 239 | uint64_t remote_addr; | ||
| 240 | union { | ||
| 241 | struct { | ||
| 242 | uint64_t compare; | ||
| 243 | uint64_t swap; | ||
| 244 | } cswp; | ||
| 245 | struct { | ||
| 246 | uint64_t add; | ||
| 247 | } fadd; | ||
| 248 | struct { | ||
| 249 | uint64_t compare; | ||
| 250 | uint64_t swap; | ||
| 251 | uint64_t compare_mask; | ||
| 252 | uint64_t swap_mask; | ||
| 253 | } m_cswp; | ||
| 254 | struct { | ||
| 255 | uint64_t add; | ||
| 256 | uint64_t nocarry_mask; | ||
| 257 | } m_fadd; | ||
| 258 | }; | ||
| 259 | uint64_t flags; | ||
| 260 | uint64_t user_token; | ||
| 238 | }; | 261 | }; |
| 239 | 262 | ||
| 240 | struct rds_rdma_notify { | 263 | struct rds_rdma_notify { |
| 241 | u_int64_t user_token; | 264 | uint64_t user_token; |
| 242 | int32_t status; | 265 | int32_t status; |
| 243 | }; | 266 | }; |
| 244 | 267 | ||
| @@ -257,5 +280,6 @@ struct rds_rdma_notify { | |||
| 257 | #define RDS_RDMA_USE_ONCE 0x0008 /* free MR after use */ | 280 | #define RDS_RDMA_USE_ONCE 0x0008 /* free MR after use */ |
| 258 | #define RDS_RDMA_DONTWAIT 0x0010 /* Don't wait in SET_BARRIER */ | 281 | #define RDS_RDMA_DONTWAIT 0x0010 /* Don't wait in SET_BARRIER */ |
| 259 | #define RDS_RDMA_NOTIFY_ME 0x0020 /* Notify when operation completes */ | 282 | #define RDS_RDMA_NOTIFY_ME 0x0020 /* Notify when operation completes */ |
| 283 | #define RDS_RDMA_SILENT 0x0040 /* Do not interrupt remote */ | ||
| 260 | 284 | ||
| 261 | #endif /* IB_RDS_H */ | 285 | #endif /* IB_RDS_H */ |
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index aebfecbdb841..bb6ad81b671d 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c | |||
| @@ -39,7 +39,15 @@ | |||
| 39 | #include <net/sock.h> | 39 | #include <net/sock.h> |
| 40 | 40 | ||
| 41 | #include "rds.h" | 41 | #include "rds.h" |
| 42 | #include "rdma.h" | 42 | |
| 43 | char *rds_str_array(char **array, size_t elements, size_t index) | ||
| 44 | { | ||
| 45 | if ((index < elements) && array[index]) | ||
| 46 | return array[index]; | ||
| 47 | else | ||
| 48 | return "unknown"; | ||
| 49 | } | ||
| 50 | EXPORT_SYMBOL(rds_str_array); | ||
| 43 | 51 | ||
| 44 | /* this is just used for stats gathering :/ */ | 52 | /* this is just used for stats gathering :/ */ |
| 45 | static DEFINE_SPINLOCK(rds_sock_lock); | 53 | static DEFINE_SPINLOCK(rds_sock_lock); |
| @@ -62,7 +70,7 @@ static int rds_release(struct socket *sock) | |||
| 62 | struct rds_sock *rs; | 70 | struct rds_sock *rs; |
| 63 | unsigned long flags; | 71 | unsigned long flags; |
| 64 | 72 | ||
| 65 | if (sk == NULL) | 73 | if (!sk) |
| 66 | goto out; | 74 | goto out; |
| 67 | 75 | ||
| 68 | rs = rds_sk_to_rs(sk); | 76 | rs = rds_sk_to_rs(sk); |
| @@ -73,7 +81,15 @@ static int rds_release(struct socket *sock) | |||
| 73 | * with the socket. */ | 81 | * with the socket. */ |
| 74 | rds_clear_recv_queue(rs); | 82 | rds_clear_recv_queue(rs); |
| 75 | rds_cong_remove_socket(rs); | 83 | rds_cong_remove_socket(rs); |
| 84 | |||
| 85 | /* | ||
| 86 | * the binding lookup hash uses rcu, we need to | ||
| 87 | * make sure we synchronize_rcu before we free our | ||
| 88 | * entry | ||
| 89 | */ | ||
| 76 | rds_remove_bound(rs); | 90 | rds_remove_bound(rs); |
| 91 | synchronize_rcu(); | ||
| 92 | |||
| 77 | rds_send_drop_to(rs, NULL); | 93 | rds_send_drop_to(rs, NULL); |
| 78 | rds_rdma_drop_keys(rs); | 94 | rds_rdma_drop_keys(rs); |
| 79 | rds_notify_queue_get(rs, NULL); | 95 | rds_notify_queue_get(rs, NULL); |
| @@ -83,6 +99,8 @@ static int rds_release(struct socket *sock) | |||
| 83 | rds_sock_count--; | 99 | rds_sock_count--; |
| 84 | spin_unlock_irqrestore(&rds_sock_lock, flags); | 100 | spin_unlock_irqrestore(&rds_sock_lock, flags); |
| 85 | 101 | ||
| 102 | rds_trans_put(rs->rs_transport); | ||
| 103 | |||
| 86 | sock->sk = NULL; | 104 | sock->sk = NULL; |
| 87 | sock_put(sk); | 105 | sock_put(sk); |
| 88 | out: | 106 | out: |
| @@ -514,7 +532,7 @@ out: | |||
| 514 | spin_unlock_irqrestore(&rds_sock_lock, flags); | 532 | spin_unlock_irqrestore(&rds_sock_lock, flags); |
| 515 | } | 533 | } |
| 516 | 534 | ||
| 517 | static void __exit rds_exit(void) | 535 | static void rds_exit(void) |
| 518 | { | 536 | { |
| 519 | sock_unregister(rds_family_ops.family); | 537 | sock_unregister(rds_family_ops.family); |
| 520 | proto_unregister(&rds_proto); | 538 | proto_unregister(&rds_proto); |
| @@ -529,7 +547,7 @@ static void __exit rds_exit(void) | |||
| 529 | } | 547 | } |
| 530 | module_exit(rds_exit); | 548 | module_exit(rds_exit); |
| 531 | 549 | ||
| 532 | static int __init rds_init(void) | 550 | static int rds_init(void) |
| 533 | { | 551 | { |
| 534 | int ret; | 552 | int ret; |
| 535 | 553 | ||
diff --git a/net/rds/bind.c b/net/rds/bind.c index 5d95fc007f1a..2f6b3fcc79f8 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c | |||
| @@ -34,45 +34,52 @@ | |||
| 34 | #include <net/sock.h> | 34 | #include <net/sock.h> |
| 35 | #include <linux/in.h> | 35 | #include <linux/in.h> |
| 36 | #include <linux/if_arp.h> | 36 | #include <linux/if_arp.h> |
| 37 | #include <linux/jhash.h> | ||
| 37 | #include "rds.h" | 38 | #include "rds.h" |
| 38 | 39 | ||
| 39 | /* | 40 | #define BIND_HASH_SIZE 1024 |
| 40 | * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't | 41 | static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; |
| 41 | * particularly zippy. | ||
| 42 | * | ||
| 43 | * This is now called for every incoming frame so we arguably care much more | ||
| 44 | * about it than we used to. | ||
| 45 | */ | ||
| 46 | static DEFINE_SPINLOCK(rds_bind_lock); | 42 | static DEFINE_SPINLOCK(rds_bind_lock); |
| 47 | static struct rb_root rds_bind_tree = RB_ROOT; | ||
| 48 | 43 | ||
| 49 | static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, | 44 | static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) |
| 50 | struct rds_sock *insert) | 45 | { |
| 46 | return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & | ||
| 47 | (BIND_HASH_SIZE - 1)); | ||
| 48 | } | ||
| 49 | |||
| 50 | static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, | ||
| 51 | struct rds_sock *insert) | ||
| 51 | { | 52 | { |
| 52 | struct rb_node **p = &rds_bind_tree.rb_node; | ||
| 53 | struct rb_node *parent = NULL; | ||
| 54 | struct rds_sock *rs; | 53 | struct rds_sock *rs; |
| 54 | struct hlist_node *node; | ||
| 55 | struct hlist_head *head = hash_to_bucket(addr, port); | ||
| 55 | u64 cmp; | 56 | u64 cmp; |
| 56 | u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); | 57 | u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); |
| 57 | 58 | ||
| 58 | while (*p) { | 59 | rcu_read_lock(); |
| 59 | parent = *p; | 60 | hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) { |
| 60 | rs = rb_entry(parent, struct rds_sock, rs_bound_node); | ||
| 61 | |||
| 62 | cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | | 61 | cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | |
| 63 | be16_to_cpu(rs->rs_bound_port); | 62 | be16_to_cpu(rs->rs_bound_port); |
| 64 | 63 | ||
| 65 | if (needle < cmp) | 64 | if (cmp == needle) { |
| 66 | p = &(*p)->rb_left; | 65 | rcu_read_unlock(); |
| 67 | else if (needle > cmp) | ||
| 68 | p = &(*p)->rb_right; | ||
| 69 | else | ||
| 70 | return rs; | 66 | return rs; |
| 67 | } | ||
| 71 | } | 68 | } |
| 69 | rcu_read_unlock(); | ||
| 72 | 70 | ||
| 73 | if (insert) { | 71 | if (insert) { |
| 74 | rb_link_node(&insert->rs_bound_node, parent, p); | 72 | /* |
| 75 | rb_insert_color(&insert->rs_bound_node, &rds_bind_tree); | 73 | * make sure our addr and port are set before |
| 74 | * we are added to the list, other people | ||
| 75 | * in rcu will find us as soon as the | ||
| 76 | * hlist_add_head_rcu is done | ||
| 77 | */ | ||
| 78 | insert->rs_bound_addr = addr; | ||
| 79 | insert->rs_bound_port = port; | ||
| 80 | rds_sock_addref(insert); | ||
| 81 | |||
| 82 | hlist_add_head_rcu(&insert->rs_bound_node, head); | ||
| 76 | } | 83 | } |
| 77 | return NULL; | 84 | return NULL; |
| 78 | } | 85 | } |
| @@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, | |||
| 86 | struct rds_sock *rds_find_bound(__be32 addr, __be16 port) | 93 | struct rds_sock *rds_find_bound(__be32 addr, __be16 port) |
| 87 | { | 94 | { |
| 88 | struct rds_sock *rs; | 95 | struct rds_sock *rs; |
| 89 | unsigned long flags; | ||
| 90 | 96 | ||
| 91 | spin_lock_irqsave(&rds_bind_lock, flags); | 97 | rs = rds_bind_lookup(addr, port, NULL); |
| 92 | rs = rds_bind_tree_walk(addr, port, NULL); | 98 | |
| 93 | if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) | 99 | if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) |
| 94 | rds_sock_addref(rs); | 100 | rds_sock_addref(rs); |
| 95 | else | 101 | else |
| 96 | rs = NULL; | 102 | rs = NULL; |
| 97 | spin_unlock_irqrestore(&rds_bind_lock, flags); | ||
| 98 | 103 | ||
| 99 | rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, | 104 | rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, |
| 100 | ntohs(port)); | 105 | ntohs(port)); |
| @@ -121,22 +126,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) | |||
| 121 | do { | 126 | do { |
| 122 | if (rover == 0) | 127 | if (rover == 0) |
| 123 | rover++; | 128 | rover++; |
| 124 | if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) { | 129 | if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { |
| 125 | *port = cpu_to_be16(rover); | 130 | *port = rs->rs_bound_port; |
| 126 | ret = 0; | 131 | ret = 0; |
| 132 | rdsdebug("rs %p binding to %pI4:%d\n", | ||
| 133 | rs, &addr, (int)ntohs(*port)); | ||
| 127 | break; | 134 | break; |
| 128 | } | 135 | } |
| 129 | } while (rover++ != last); | 136 | } while (rover++ != last); |
| 130 | 137 | ||
| 131 | if (ret == 0) { | ||
| 132 | rs->rs_bound_addr = addr; | ||
| 133 | rs->rs_bound_port = *port; | ||
| 134 | rds_sock_addref(rs); | ||
| 135 | |||
| 136 | rdsdebug("rs %p binding to %pI4:%d\n", | ||
| 137 | rs, &addr, (int)ntohs(*port)); | ||
| 138 | } | ||
| 139 | |||
| 140 | spin_unlock_irqrestore(&rds_bind_lock, flags); | 138 | spin_unlock_irqrestore(&rds_bind_lock, flags); |
| 141 | 139 | ||
| 142 | return ret; | 140 | return ret; |
| @@ -153,7 +151,7 @@ void rds_remove_bound(struct rds_sock *rs) | |||
| 153 | rs, &rs->rs_bound_addr, | 151 | rs, &rs->rs_bound_addr, |
| 154 | ntohs(rs->rs_bound_port)); | 152 | ntohs(rs->rs_bound_port)); |
| 155 | 153 | ||
| 156 | rb_erase(&rs->rs_bound_node, &rds_bind_tree); | 154 | hlist_del_init_rcu(&rs->rs_bound_node); |
| 157 | rds_sock_put(rs); | 155 | rds_sock_put(rs); |
| 158 | rs->rs_bound_addr = 0; | 156 | rs->rs_bound_addr = 0; |
| 159 | } | 157 | } |
| @@ -184,7 +182,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) | |||
| 184 | goto out; | 182 | goto out; |
| 185 | 183 | ||
| 186 | trans = rds_trans_get_preferred(sin->sin_addr.s_addr); | 184 | trans = rds_trans_get_preferred(sin->sin_addr.s_addr); |
| 187 | if (trans == NULL) { | 185 | if (!trans) { |
| 188 | ret = -EADDRNOTAVAIL; | 186 | ret = -EADDRNOTAVAIL; |
| 189 | rds_remove_bound(rs); | 187 | rds_remove_bound(rs); |
| 190 | if (printk_ratelimit()) | 188 | if (printk_ratelimit()) |
| @@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) | |||
| 198 | 196 | ||
| 199 | out: | 197 | out: |
| 200 | release_sock(sk); | 198 | release_sock(sk); |
| 199 | |||
| 200 | /* we might have called rds_remove_bound on error */ | ||
| 201 | if (ret) | ||
| 202 | synchronize_rcu(); | ||
| 201 | return ret; | 203 | return ret; |
| 202 | } | 204 | } |
diff --git a/net/rds/cong.c b/net/rds/cong.c index 0871a29f0780..75ea686f27d5 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c | |||
| @@ -141,7 +141,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) | |||
| 141 | unsigned long flags; | 141 | unsigned long flags; |
| 142 | 142 | ||
| 143 | map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); | 143 | map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); |
| 144 | if (map == NULL) | 144 | if (!map) |
| 145 | return NULL; | 145 | return NULL; |
| 146 | 146 | ||
| 147 | map->m_addr = addr; | 147 | map->m_addr = addr; |
| @@ -159,7 +159,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) | |||
| 159 | ret = rds_cong_tree_walk(addr, map); | 159 | ret = rds_cong_tree_walk(addr, map); |
| 160 | spin_unlock_irqrestore(&rds_cong_lock, flags); | 160 | spin_unlock_irqrestore(&rds_cong_lock, flags); |
| 161 | 161 | ||
| 162 | if (ret == NULL) { | 162 | if (!ret) { |
| 163 | ret = map; | 163 | ret = map; |
| 164 | map = NULL; | 164 | map = NULL; |
| 165 | } | 165 | } |
| @@ -205,7 +205,7 @@ int rds_cong_get_maps(struct rds_connection *conn) | |||
| 205 | conn->c_lcong = rds_cong_from_addr(conn->c_laddr); | 205 | conn->c_lcong = rds_cong_from_addr(conn->c_laddr); |
| 206 | conn->c_fcong = rds_cong_from_addr(conn->c_faddr); | 206 | conn->c_fcong = rds_cong_from_addr(conn->c_faddr); |
| 207 | 207 | ||
| 208 | if (conn->c_lcong == NULL || conn->c_fcong == NULL) | 208 | if (!(conn->c_lcong && conn->c_fcong)) |
| 209 | return -ENOMEM; | 209 | return -ENOMEM; |
| 210 | 210 | ||
| 211 | return 0; | 211 | return 0; |
| @@ -221,7 +221,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map) | |||
| 221 | list_for_each_entry(conn, &map->m_conn_list, c_map_item) { | 221 | list_for_each_entry(conn, &map->m_conn_list, c_map_item) { |
| 222 | if (!test_and_set_bit(0, &conn->c_map_queued)) { | 222 | if (!test_and_set_bit(0, &conn->c_map_queued)) { |
| 223 | rds_stats_inc(s_cong_update_queued); | 223 | rds_stats_inc(s_cong_update_queued); |
| 224 | queue_delayed_work(rds_wq, &conn->c_send_w, 0); | 224 | rds_send_xmit(conn); |
| 225 | } | 225 | } |
| 226 | } | 226 | } |
| 227 | 227 | ||
diff --git a/net/rds/connection.c b/net/rds/connection.c index 7619b671ca28..870992e08cae 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c | |||
| @@ -37,7 +37,6 @@ | |||
| 37 | 37 | ||
| 38 | #include "rds.h" | 38 | #include "rds.h" |
| 39 | #include "loop.h" | 39 | #include "loop.h" |
| 40 | #include "rdma.h" | ||
| 41 | 40 | ||
| 42 | #define RDS_CONNECTION_HASH_BITS 12 | 41 | #define RDS_CONNECTION_HASH_BITS 12 |
| 43 | #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) | 42 | #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) |
| @@ -63,18 +62,7 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) | |||
| 63 | var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ | 62 | var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ |
| 64 | } while (0) | 63 | } while (0) |
| 65 | 64 | ||
| 66 | static inline int rds_conn_is_sending(struct rds_connection *conn) | 65 | /* rcu read lock must be held or the connection spinlock */ |
| 67 | { | ||
| 68 | int ret = 0; | ||
| 69 | |||
| 70 | if (!mutex_trylock(&conn->c_send_lock)) | ||
| 71 | ret = 1; | ||
| 72 | else | ||
| 73 | mutex_unlock(&conn->c_send_lock); | ||
| 74 | |||
| 75 | return ret; | ||
| 76 | } | ||
| 77 | |||
| 78 | static struct rds_connection *rds_conn_lookup(struct hlist_head *head, | 66 | static struct rds_connection *rds_conn_lookup(struct hlist_head *head, |
| 79 | __be32 laddr, __be32 faddr, | 67 | __be32 laddr, __be32 faddr, |
| 80 | struct rds_transport *trans) | 68 | struct rds_transport *trans) |
| @@ -82,7 +70,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, | |||
| 82 | struct rds_connection *conn, *ret = NULL; | 70 | struct rds_connection *conn, *ret = NULL; |
| 83 | struct hlist_node *pos; | 71 | struct hlist_node *pos; |
| 84 | 72 | ||
| 85 | hlist_for_each_entry(conn, pos, head, c_hash_node) { | 73 | hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { |
| 86 | if (conn->c_faddr == faddr && conn->c_laddr == laddr && | 74 | if (conn->c_faddr == faddr && conn->c_laddr == laddr && |
| 87 | conn->c_trans == trans) { | 75 | conn->c_trans == trans) { |
| 88 | ret = conn; | 76 | ret = conn; |
| @@ -129,10 +117,11 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, | |||
| 129 | { | 117 | { |
| 130 | struct rds_connection *conn, *parent = NULL; | 118 | struct rds_connection *conn, *parent = NULL; |
| 131 | struct hlist_head *head = rds_conn_bucket(laddr, faddr); | 119 | struct hlist_head *head = rds_conn_bucket(laddr, faddr); |
| 120 | struct rds_transport *loop_trans; | ||
| 132 | unsigned long flags; | 121 | unsigned long flags; |
| 133 | int ret; | 122 | int ret; |
| 134 | 123 | ||
| 135 | spin_lock_irqsave(&rds_conn_lock, flags); | 124 | rcu_read_lock(); |
| 136 | conn = rds_conn_lookup(head, laddr, faddr, trans); | 125 | conn = rds_conn_lookup(head, laddr, faddr, trans); |
| 137 | if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && | 126 | if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && |
| 138 | !is_outgoing) { | 127 | !is_outgoing) { |
| @@ -143,12 +132,12 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, | |||
| 143 | parent = conn; | 132 | parent = conn; |
| 144 | conn = parent->c_passive; | 133 | conn = parent->c_passive; |
| 145 | } | 134 | } |
| 146 | spin_unlock_irqrestore(&rds_conn_lock, flags); | 135 | rcu_read_unlock(); |
| 147 | if (conn) | 136 | if (conn) |
| 148 | goto out; | 137 | goto out; |
| 149 | 138 | ||
| 150 | conn = kmem_cache_zalloc(rds_conn_slab, gfp); | 139 | conn = kmem_cache_zalloc(rds_conn_slab, gfp); |
| 151 | if (conn == NULL) { | 140 | if (!conn) { |
| 152 | conn = ERR_PTR(-ENOMEM); | 141 | conn = ERR_PTR(-ENOMEM); |
| 153 | goto out; | 142 | goto out; |
| 154 | } | 143 | } |
| @@ -159,7 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, | |||
| 159 | spin_lock_init(&conn->c_lock); | 148 | spin_lock_init(&conn->c_lock); |
| 160 | conn->c_next_tx_seq = 1; | 149 | conn->c_next_tx_seq = 1; |
| 161 | 150 | ||
| 162 | mutex_init(&conn->c_send_lock); | 151 | init_waitqueue_head(&conn->c_waitq); |
| 163 | INIT_LIST_HEAD(&conn->c_send_queue); | 152 | INIT_LIST_HEAD(&conn->c_send_queue); |
| 164 | INIT_LIST_HEAD(&conn->c_retrans); | 153 | INIT_LIST_HEAD(&conn->c_retrans); |
| 165 | 154 | ||
| @@ -175,7 +164,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, | |||
| 175 | * can bind to the destination address then we'd rather the messages | 164 | * can bind to the destination address then we'd rather the messages |
| 176 | * flow through loopback rather than either transport. | 165 | * flow through loopback rather than either transport. |
| 177 | */ | 166 | */ |
| 178 | if (rds_trans_get_preferred(faddr)) { | 167 | loop_trans = rds_trans_get_preferred(faddr); |
| 168 | if (loop_trans) { | ||
| 169 | rds_trans_put(loop_trans); | ||
| 179 | conn->c_loopback = 1; | 170 | conn->c_loopback = 1; |
| 180 | if (is_outgoing && trans->t_prefer_loopback) { | 171 | if (is_outgoing && trans->t_prefer_loopback) { |
| 181 | /* "outgoing" connection - and the transport | 172 | /* "outgoing" connection - and the transport |
| @@ -238,7 +229,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, | |||
| 238 | kmem_cache_free(rds_conn_slab, conn); | 229 | kmem_cache_free(rds_conn_slab, conn); |
| 239 | conn = found; | 230 | conn = found; |
| 240 | } else { | 231 | } else { |
| 241 | hlist_add_head(&conn->c_hash_node, head); | 232 | hlist_add_head_rcu(&conn->c_hash_node, head); |
| 242 | rds_cong_add_conn(conn); | 233 | rds_cong_add_conn(conn); |
| 243 | rds_conn_count++; | 234 | rds_conn_count++; |
| 244 | } | 235 | } |
| @@ -263,21 +254,91 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, | |||
| 263 | } | 254 | } |
| 264 | EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); | 255 | EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); |
| 265 | 256 | ||
| 257 | void rds_conn_shutdown(struct rds_connection *conn) | ||
| 258 | { | ||
| 259 | /* shut it down unless it's down already */ | ||
| 260 | if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { | ||
| 261 | /* | ||
| 262 | * Quiesce the connection mgmt handlers before we start tearing | ||
| 263 | * things down. We don't hold the mutex for the entire | ||
| 264 | * duration of the shutdown operation, else we may be | ||
| 265 | * deadlocking with the CM handler. Instead, the CM event | ||
| 266 | * handler is supposed to check for state DISCONNECTING | ||
| 267 | */ | ||
| 268 | mutex_lock(&conn->c_cm_lock); | ||
| 269 | if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) | ||
| 270 | && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { | ||
| 271 | rds_conn_error(conn, "shutdown called in state %d\n", | ||
| 272 | atomic_read(&conn->c_state)); | ||
| 273 | mutex_unlock(&conn->c_cm_lock); | ||
| 274 | return; | ||
| 275 | } | ||
| 276 | mutex_unlock(&conn->c_cm_lock); | ||
| 277 | |||
| 278 | wait_event(conn->c_waitq, | ||
| 279 | !test_bit(RDS_IN_XMIT, &conn->c_flags)); | ||
| 280 | |||
| 281 | conn->c_trans->conn_shutdown(conn); | ||
| 282 | rds_conn_reset(conn); | ||
| 283 | |||
| 284 | if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { | ||
| 285 | /* This can happen - eg when we're in the middle of tearing | ||
| 286 | * down the connection, and someone unloads the rds module. | ||
| 288 | * Quite reproducible with loopback connections. | ||
| 288 | * Mostly harmless. | ||
| 289 | */ | ||
| 290 | rds_conn_error(conn, | ||
| 291 | "%s: failed to transition to state DOWN, " | ||
| 292 | "current state is %d\n", | ||
| 293 | __func__, | ||
| 294 | atomic_read(&conn->c_state)); | ||
| 295 | return; | ||
| 296 | } | ||
| 297 | } | ||
| 298 | |||
| 299 | /* Then reconnect if it's still live. | ||
| 300 | * The passive side of an IB loopback connection is never added | ||
| 301 | * to the conn hash, so we never trigger a reconnect on this | ||
| 302 | * conn - the reconnect is always triggered by the active peer. */ | ||
| 303 | cancel_delayed_work_sync(&conn->c_conn_w); | ||
| 304 | rcu_read_lock(); | ||
| 305 | if (!hlist_unhashed(&conn->c_hash_node)) { | ||
| 306 | rcu_read_unlock(); | ||
| 307 | rds_queue_reconnect(conn); | ||
| 308 | } else { | ||
| 309 | rcu_read_unlock(); | ||
| 310 | } | ||
| 311 | } | ||
| 312 | |||
| 313 | /* | ||
| 314 | * Stop and free a connection. | ||
| 315 | * | ||
| 316 | * This can only be used in very limited circumstances. It assumes that once | ||
| 317 | * the conn has been shutdown that no one else is referencing the connection. | ||
| 318 | * We can only ensure this in the rmmod path in the current code. | ||
| 319 | */ | ||
| 266 | void rds_conn_destroy(struct rds_connection *conn) | 320 | void rds_conn_destroy(struct rds_connection *conn) |
| 267 | { | 321 | { |
| 268 | struct rds_message *rm, *rtmp; | 322 | struct rds_message *rm, *rtmp; |
| 323 | unsigned long flags; | ||
| 269 | 324 | ||
| 270 | rdsdebug("freeing conn %p for %pI4 -> " | 325 | rdsdebug("freeing conn %p for %pI4 -> " |
| 271 | "%pI4\n", conn, &conn->c_laddr, | 326 | "%pI4\n", conn, &conn->c_laddr, |
| 272 | &conn->c_faddr); | 327 | &conn->c_faddr); |
| 273 | 328 | ||
| 274 | hlist_del_init(&conn->c_hash_node); | 329 | /* Ensure conn will not be scheduled for reconnect */ |
| 330 | spin_lock_irq(&rds_conn_lock); | ||
| 331 | hlist_del_init_rcu(&conn->c_hash_node); | ||
| 332 | spin_unlock_irq(&rds_conn_lock); | ||
| 333 | synchronize_rcu(); | ||
| 275 | 334 | ||
| 276 | /* wait for the rds thread to shut it down */ | 335 | /* shut the connection down */ |
| 277 | atomic_set(&conn->c_state, RDS_CONN_ERROR); | 336 | rds_conn_drop(conn); |
| 278 | cancel_delayed_work(&conn->c_conn_w); | 337 | flush_work(&conn->c_down_w); |
| 279 | queue_work(rds_wq, &conn->c_down_w); | 338 | |
| 280 | flush_workqueue(rds_wq); | 339 | /* make sure lingering queued work won't try to ref the conn */ |
| 340 | cancel_delayed_work_sync(&conn->c_send_w); | ||
| 341 | cancel_delayed_work_sync(&conn->c_recv_w); | ||
| 281 | 342 | ||
| 282 | /* tear down queued messages */ | 343 | /* tear down queued messages */ |
| 283 | list_for_each_entry_safe(rm, rtmp, | 344 | list_for_each_entry_safe(rm, rtmp, |
| @@ -302,7 +363,9 @@ void rds_conn_destroy(struct rds_connection *conn) | |||
| 302 | BUG_ON(!list_empty(&conn->c_retrans)); | 363 | BUG_ON(!list_empty(&conn->c_retrans)); |
| 303 | kmem_cache_free(rds_conn_slab, conn); | 364 | kmem_cache_free(rds_conn_slab, conn); |
| 304 | 365 | ||
| 366 | spin_lock_irqsave(&rds_conn_lock, flags); | ||
| 305 | rds_conn_count--; | 367 | rds_conn_count--; |
| 368 | spin_unlock_irqrestore(&rds_conn_lock, flags); | ||
| 306 | } | 369 | } |
| 307 | EXPORT_SYMBOL_GPL(rds_conn_destroy); | 370 | EXPORT_SYMBOL_GPL(rds_conn_destroy); |
| 308 | 371 | ||
| @@ -316,23 +379,23 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, | |||
| 316 | struct list_head *list; | 379 | struct list_head *list; |
| 317 | struct rds_connection *conn; | 380 | struct rds_connection *conn; |
| 318 | struct rds_message *rm; | 381 | struct rds_message *rm; |
| 319 | unsigned long flags; | ||
| 320 | unsigned int total = 0; | 382 | unsigned int total = 0; |
| 383 | unsigned long flags; | ||
| 321 | size_t i; | 384 | size_t i; |
| 322 | 385 | ||
| 323 | len /= sizeof(struct rds_info_message); | 386 | len /= sizeof(struct rds_info_message); |
| 324 | 387 | ||
| 325 | spin_lock_irqsave(&rds_conn_lock, flags); | 388 | rcu_read_lock(); |
| 326 | 389 | ||
| 327 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); | 390 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); |
| 328 | i++, head++) { | 391 | i++, head++) { |
| 329 | hlist_for_each_entry(conn, pos, head, c_hash_node) { | 392 | hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { |
| 330 | if (want_send) | 393 | if (want_send) |
| 331 | list = &conn->c_send_queue; | 394 | list = &conn->c_send_queue; |
| 332 | else | 395 | else |
| 333 | list = &conn->c_retrans; | 396 | list = &conn->c_retrans; |
| 334 | 397 | ||
| 335 | spin_lock(&conn->c_lock); | 398 | spin_lock_irqsave(&conn->c_lock, flags); |
| 336 | 399 | ||
| 337 | /* XXX too lazy to maintain counts.. */ | 400 | /* XXX too lazy to maintain counts.. */ |
| 338 | list_for_each_entry(rm, list, m_conn_item) { | 401 | list_for_each_entry(rm, list, m_conn_item) { |
| @@ -343,11 +406,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, | |||
| 343 | conn->c_faddr, 0); | 406 | conn->c_faddr, 0); |
| 344 | } | 407 | } |
| 345 | 408 | ||
| 346 | spin_unlock(&conn->c_lock); | 409 | spin_unlock_irqrestore(&conn->c_lock, flags); |
| 347 | } | 410 | } |
| 348 | } | 411 | } |
| 349 | 412 | rcu_read_unlock(); | |
| 350 | spin_unlock_irqrestore(&rds_conn_lock, flags); | ||
| 351 | 413 | ||
| 352 | lens->nr = total; | 414 | lens->nr = total; |
| 353 | lens->each = sizeof(struct rds_info_message); | 415 | lens->each = sizeof(struct rds_info_message); |
| @@ -377,19 +439,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, | |||
| 377 | uint64_t buffer[(item_len + 7) / 8]; | 439 | uint64_t buffer[(item_len + 7) / 8]; |
| 378 | struct hlist_head *head; | 440 | struct hlist_head *head; |
| 379 | struct hlist_node *pos; | 441 | struct hlist_node *pos; |
| 380 | struct hlist_node *tmp; | ||
| 381 | struct rds_connection *conn; | 442 | struct rds_connection *conn; |
| 382 | unsigned long flags; | ||
| 383 | size_t i; | 443 | size_t i; |
| 384 | 444 | ||
| 385 | spin_lock_irqsave(&rds_conn_lock, flags); | 445 | rcu_read_lock(); |
| 386 | 446 | ||
| 387 | lens->nr = 0; | 447 | lens->nr = 0; |
| 388 | lens->each = item_len; | 448 | lens->each = item_len; |
| 389 | 449 | ||
| 390 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); | 450 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); |
| 391 | i++, head++) { | 451 | i++, head++) { |
| 392 | hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) { | 452 | hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { |
| 393 | 453 | ||
| 394 | /* XXX no c_lock usage.. */ | 454 | /* XXX no c_lock usage.. */ |
| 395 | if (!visitor(conn, buffer)) | 455 | if (!visitor(conn, buffer)) |
| @@ -405,8 +465,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, | |||
| 405 | lens->nr++; | 465 | lens->nr++; |
| 406 | } | 466 | } |
| 407 | } | 467 | } |
| 408 | 468 | rcu_read_unlock(); | |
| 409 | spin_unlock_irqrestore(&rds_conn_lock, flags); | ||
| 410 | } | 469 | } |
| 411 | EXPORT_SYMBOL_GPL(rds_for_each_conn_info); | 470 | EXPORT_SYMBOL_GPL(rds_for_each_conn_info); |
| 412 | 471 | ||
| @@ -423,8 +482,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn, | |||
| 423 | sizeof(cinfo->transport)); | 482 | sizeof(cinfo->transport)); |
| 424 | cinfo->flags = 0; | 483 | cinfo->flags = 0; |
| 425 | 484 | ||
| 426 | rds_conn_info_set(cinfo->flags, | 485 | rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags), |
| 427 | rds_conn_is_sending(conn), SENDING); | 486 | SENDING); |
| 428 | /* XXX Future: return the state rather than these funky bits */ | 487 | /* XXX Future: return the state rather than these funky bits */ |
| 429 | rds_conn_info_set(cinfo->flags, | 488 | rds_conn_info_set(cinfo->flags, |
| 430 | atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, | 489 | atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, |
| @@ -444,12 +503,12 @@ static void rds_conn_info(struct socket *sock, unsigned int len, | |||
| 444 | sizeof(struct rds_info_connection)); | 503 | sizeof(struct rds_info_connection)); |
| 445 | } | 504 | } |
| 446 | 505 | ||
| 447 | int __init rds_conn_init(void) | 506 | int rds_conn_init(void) |
| 448 | { | 507 | { |
| 449 | rds_conn_slab = kmem_cache_create("rds_connection", | 508 | rds_conn_slab = kmem_cache_create("rds_connection", |
| 450 | sizeof(struct rds_connection), | 509 | sizeof(struct rds_connection), |
| 451 | 0, 0, NULL); | 510 | 0, 0, NULL); |
| 452 | if (rds_conn_slab == NULL) | 511 | if (!rds_conn_slab) |
| 453 | return -ENOMEM; | 512 | return -ENOMEM; |
| 454 | 513 | ||
| 455 | rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); | 514 | rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); |
| @@ -487,6 +546,18 @@ void rds_conn_drop(struct rds_connection *conn) | |||
| 487 | EXPORT_SYMBOL_GPL(rds_conn_drop); | 546 | EXPORT_SYMBOL_GPL(rds_conn_drop); |
| 488 | 547 | ||
| 489 | /* | 548 | /* |
| 549 | * If the connection is down, trigger a connect. We may have scheduled a | ||
| 550 | * delayed reconnect however - in this case we should not interfere. | ||
| 551 | */ | ||
| 552 | void rds_conn_connect_if_down(struct rds_connection *conn) | ||
| 553 | { | ||
| 554 | if (rds_conn_state(conn) == RDS_CONN_DOWN && | ||
| 555 | !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) | ||
| 556 | queue_delayed_work(rds_wq, &conn->c_conn_w, 0); | ||
| 557 | } | ||
| 558 | EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); | ||
| 559 | |||
| 560 | /* | ||
| 490 | * An error occurred on the connection | 561 | * An error occurred on the connection |
| 491 | */ | 562 | */ |
| 492 | void | 563 | void |
diff --git a/net/rds/ib.c b/net/rds/ib.c index 8f2d6dd7700a..b12a3951167d 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c | |||
| @@ -53,12 +53,71 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); | |||
| 53 | module_param(rds_ib_retry_count, int, 0444); | 53 | module_param(rds_ib_retry_count, int, 0444); |
| 54 | MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); | 54 | MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); |
| 55 | 55 | ||
| 56 | /* | ||
| 57 | * we have a clumsy combination of RCU and a rwsem protecting this list | ||
| 58 | * because it is used both in the get_mr fast path and while blocking in | ||
| 59 | * the FMR flushing path. | ||
| 60 | */ | ||
| 61 | DECLARE_RWSEM(rds_ib_devices_lock); | ||
| 56 | struct list_head rds_ib_devices; | 62 | struct list_head rds_ib_devices; |
| 57 | 63 | ||
| 58 | /* NOTE: if also grabbing ibdev lock, grab this first */ | 64 | /* NOTE: if also grabbing ibdev lock, grab this first */ |
| 59 | DEFINE_SPINLOCK(ib_nodev_conns_lock); | 65 | DEFINE_SPINLOCK(ib_nodev_conns_lock); |
| 60 | LIST_HEAD(ib_nodev_conns); | 66 | LIST_HEAD(ib_nodev_conns); |
| 61 | 67 | ||
| 68 | void rds_ib_nodev_connect(void) | ||
| 69 | { | ||
| 70 | struct rds_ib_connection *ic; | ||
| 71 | |||
| 72 | spin_lock(&ib_nodev_conns_lock); | ||
| 73 | list_for_each_entry(ic, &ib_nodev_conns, ib_node) | ||
| 74 | rds_conn_connect_if_down(ic->conn); | ||
| 75 | spin_unlock(&ib_nodev_conns_lock); | ||
| 76 | } | ||
| 77 | |||
| 78 | void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) | ||
| 79 | { | ||
| 80 | struct rds_ib_connection *ic; | ||
| 81 | unsigned long flags; | ||
| 82 | |||
| 83 | spin_lock_irqsave(&rds_ibdev->spinlock, flags); | ||
| 84 | list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) | ||
| 85 | rds_conn_drop(ic->conn); | ||
| 86 | spin_unlock_irqrestore(&rds_ibdev->spinlock, flags); | ||
| 87 | } | ||
| 88 | |||
| 89 | /* | ||
| 90 | * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references | ||
| 92 | * from interrupt context so we push freeing off into a work struct in krdsd. | ||
| 92 | */ | ||
| 93 | static void rds_ib_dev_free(struct work_struct *work) | ||
| 94 | { | ||
| 95 | struct rds_ib_ipaddr *i_ipaddr, *i_next; | ||
| 96 | struct rds_ib_device *rds_ibdev = container_of(work, | ||
| 97 | struct rds_ib_device, free_work); | ||
| 98 | |||
| 99 | if (rds_ibdev->mr_pool) | ||
| 100 | rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); | ||
| 101 | if (rds_ibdev->mr) | ||
| 102 | ib_dereg_mr(rds_ibdev->mr); | ||
| 103 | if (rds_ibdev->pd) | ||
| 104 | ib_dealloc_pd(rds_ibdev->pd); | ||
| 105 | |||
| 106 | list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { | ||
| 107 | list_del(&i_ipaddr->list); | ||
| 108 | kfree(i_ipaddr); | ||
| 109 | } | ||
| 110 | |||
| 111 | kfree(rds_ibdev); | ||
| 112 | } | ||
| 113 | |||
| 114 | void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) | ||
| 115 | { | ||
| 116 | BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); | ||
| 117 | if (atomic_dec_and_test(&rds_ibdev->refcount)) | ||
| 118 | queue_work(rds_wq, &rds_ibdev->free_work); | ||
| 119 | } | ||
| 120 | |||
| 62 | void rds_ib_add_one(struct ib_device *device) | 121 | void rds_ib_add_one(struct ib_device *device) |
| 63 | { | 122 | { |
| 64 | struct rds_ib_device *rds_ibdev; | 123 | struct rds_ib_device *rds_ibdev; |
| @@ -77,11 +136,14 @@ void rds_ib_add_one(struct ib_device *device) | |||
| 77 | goto free_attr; | 136 | goto free_attr; |
| 78 | } | 137 | } |
| 79 | 138 | ||
| 80 | rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); | 139 | rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, |
| 140 | ibdev_to_node(device)); | ||
| 81 | if (!rds_ibdev) | 141 | if (!rds_ibdev) |
| 82 | goto free_attr; | 142 | goto free_attr; |
| 83 | 143 | ||
| 84 | spin_lock_init(&rds_ibdev->spinlock); | 144 | spin_lock_init(&rds_ibdev->spinlock); |
| 145 | atomic_set(&rds_ibdev->refcount, 1); | ||
| 146 | INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); | ||
| 85 | 147 | ||
| 86 | rds_ibdev->max_wrs = dev_attr->max_qp_wr; | 148 | rds_ibdev->max_wrs = dev_attr->max_qp_wr; |
| 87 | rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); | 149 | rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); |
| @@ -91,68 +153,107 @@ void rds_ib_add_one(struct ib_device *device) | |||
| 91 | min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : | 153 | min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : |
| 92 | fmr_pool_size; | 154 | fmr_pool_size; |
| 93 | 155 | ||
| 156 | rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; | ||
| 157 | rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; | ||
| 158 | |||
| 94 | rds_ibdev->dev = device; | 159 | rds_ibdev->dev = device; |
| 95 | rds_ibdev->pd = ib_alloc_pd(device); | 160 | rds_ibdev->pd = ib_alloc_pd(device); |
| 96 | if (IS_ERR(rds_ibdev->pd)) | 161 | if (IS_ERR(rds_ibdev->pd)) { |
| 97 | goto free_dev; | 162 | rds_ibdev->pd = NULL; |
| 163 | goto put_dev; | ||
| 164 | } | ||
| 98 | 165 | ||
| 99 | rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, | 166 | rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE); |
| 100 | IB_ACCESS_LOCAL_WRITE); | 167 | if (IS_ERR(rds_ibdev->mr)) { |
| 101 | if (IS_ERR(rds_ibdev->mr)) | 168 | rds_ibdev->mr = NULL; |
| 102 | goto err_pd; | 169 | goto put_dev; |
| 170 | } | ||
| 103 | 171 | ||
| 104 | rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); | 172 | rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); |
| 105 | if (IS_ERR(rds_ibdev->mr_pool)) { | 173 | if (IS_ERR(rds_ibdev->mr_pool)) { |
| 106 | rds_ibdev->mr_pool = NULL; | 174 | rds_ibdev->mr_pool = NULL; |
| 107 | goto err_mr; | 175 | goto put_dev; |
| 108 | } | 176 | } |
| 109 | 177 | ||
| 110 | INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); | 178 | INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); |
| 111 | INIT_LIST_HEAD(&rds_ibdev->conn_list); | 179 | INIT_LIST_HEAD(&rds_ibdev->conn_list); |
| 112 | list_add_tail(&rds_ibdev->list, &rds_ib_devices); | 180 | |
| 181 | down_write(&rds_ib_devices_lock); | ||
| 182 | list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); | ||
| 183 | up_write(&rds_ib_devices_lock); | ||
| 184 | atomic_inc(&rds_ibdev->refcount); | ||
| 113 | 185 | ||
| 114 | ib_set_client_data(device, &rds_ib_client, rds_ibdev); | 186 | ib_set_client_data(device, &rds_ib_client, rds_ibdev); |
| 187 | atomic_inc(&rds_ibdev->refcount); | ||
| 115 | 188 | ||
| 116 | goto free_attr; | 189 | rds_ib_nodev_connect(); |
| 117 | 190 | ||
| 118 | err_mr: | 191 | put_dev: |
| 119 | ib_dereg_mr(rds_ibdev->mr); | 192 | rds_ib_dev_put(rds_ibdev); |
| 120 | err_pd: | ||
| 121 | ib_dealloc_pd(rds_ibdev->pd); | ||
| 122 | free_dev: | ||
| 123 | kfree(rds_ibdev); | ||
| 124 | free_attr: | 193 | free_attr: |
| 125 | kfree(dev_attr); | 194 | kfree(dev_attr); |
| 126 | } | 195 | } |
| 127 | 196 | ||
| 197 | /* | ||
| 198 | * New connections use this to find the device to associate with the | ||
| 199 | * connection. It's not in the fast path so we're not concerned about the | ||
| 200 | * performance of the IB call. (As of this writing, it uses an interrupt | ||
| 201 | * blocking spinlock to serialize walking a per-device list of all registered | ||
| 202 | * clients.) | ||
| 203 | * | ||
| 204 | * RCU is used to handle incoming connections racing with device teardown. | ||
| 205 | * Rather than use a lock to serialize removal from the client_data and | ||
| 206 | * getting a new reference, we use an RCU grace period. The destruction | ||
| 207 | * path removes the device from client_data and then waits for all RCU | ||
| 208 | * readers to finish. | ||
| 209 | * | ||
| 210 | * A new connection can get NULL from this if it's arriving on a | ||
| 211 | * device that is in the process of being removed. | ||
| 212 | */ | ||
| 213 | struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) | ||
| 214 | { | ||
| 215 | struct rds_ib_device *rds_ibdev; | ||
| 216 | |||
| 217 | rcu_read_lock(); | ||
| 218 | rds_ibdev = ib_get_client_data(device, &rds_ib_client); | ||
| 219 | if (rds_ibdev) | ||
| 220 | atomic_inc(&rds_ibdev->refcount); | ||
| 221 | rcu_read_unlock(); | ||
| 222 | return rds_ibdev; | ||
| 223 | } | ||
| 224 | |||
| 225 | /* | ||
| 226 | * The IB stack is letting us know that a device is going away. This can | ||
| 227 | * happen if the underlying HCA driver is removed or if PCI hotplug is removing | ||
| 228 | * the pci function, for example. | ||
| 229 | * | ||
| 230 | * This can be called at any time and can be racing with any other RDS path. | ||
| 231 | */ | ||
| 128 | void rds_ib_remove_one(struct ib_device *device) | 232 | void rds_ib_remove_one(struct ib_device *device) |
| 129 | { | 233 | { |
| 130 | struct rds_ib_device *rds_ibdev; | 234 | struct rds_ib_device *rds_ibdev; |
| 131 | struct rds_ib_ipaddr *i_ipaddr, *i_next; | ||
| 132 | 235 | ||
| 133 | rds_ibdev = ib_get_client_data(device, &rds_ib_client); | 236 | rds_ibdev = ib_get_client_data(device, &rds_ib_client); |
| 134 | if (!rds_ibdev) | 237 | if (!rds_ibdev) |
| 135 | return; | 238 | return; |
| 136 | 239 | ||
| 137 | list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { | 240 | rds_ib_dev_shutdown(rds_ibdev); |
| 138 | list_del(&i_ipaddr->list); | ||
| 139 | kfree(i_ipaddr); | ||
| 140 | } | ||
| 141 | 241 | ||
| 142 | rds_ib_destroy_conns(rds_ibdev); | 242 | /* stop connection attempts from getting a reference to this device. */ |
| 243 | ib_set_client_data(device, &rds_ib_client, NULL); | ||
| 143 | 244 | ||
| 144 | if (rds_ibdev->mr_pool) | 245 | down_write(&rds_ib_devices_lock); |
| 145 | rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); | 246 | list_del_rcu(&rds_ibdev->list); |
| 146 | 247 | up_write(&rds_ib_devices_lock); | |
| 147 | ib_dereg_mr(rds_ibdev->mr); | ||
| 148 | |||
| 149 | while (ib_dealloc_pd(rds_ibdev->pd)) { | ||
| 150 | rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); | ||
| 151 | msleep(1); | ||
| 152 | } | ||
| 153 | 248 | ||
| 154 | list_del(&rds_ibdev->list); | 249 | /* |
| 155 | kfree(rds_ibdev); | 250 | * This synchronize rcu is waiting for readers of both the ib |
| 251 | * client data and the devices list to finish before we drop | ||
| 252 | * both of those references. | ||
| 253 | */ | ||
| 254 | synchronize_rcu(); | ||
| 255 | rds_ib_dev_put(rds_ibdev); | ||
| 256 | rds_ib_dev_put(rds_ibdev); | ||
| 156 | } | 257 | } |
| 157 | 258 | ||
| 158 | struct ib_client rds_ib_client = { | 259 | struct ib_client rds_ib_client = { |
| @@ -186,7 +287,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, | |||
| 186 | rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); | 287 | rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); |
| 187 | rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); | 288 | rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); |
| 188 | 289 | ||
| 189 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | 290 | rds_ibdev = ic->rds_ibdev; |
| 190 | iinfo->max_send_wr = ic->i_send_ring.w_nr; | 291 | iinfo->max_send_wr = ic->i_send_ring.w_nr; |
| 191 | iinfo->max_recv_wr = ic->i_recv_ring.w_nr; | 292 | iinfo->max_recv_wr = ic->i_recv_ring.w_nr; |
| 192 | iinfo->max_send_sge = rds_ibdev->max_sge; | 293 | iinfo->max_send_sge = rds_ibdev->max_sge; |
| @@ -248,29 +349,36 @@ static int rds_ib_laddr_check(__be32 addr) | |||
| 248 | return ret; | 349 | return ret; |
| 249 | } | 350 | } |
| 250 | 351 | ||
| 352 | static void rds_ib_unregister_client(void) | ||
| 353 | { | ||
| 354 | ib_unregister_client(&rds_ib_client); | ||
| 355 | /* wait for rds_ib_dev_free() to complete */ | ||
| 356 | flush_workqueue(rds_wq); | ||
| 357 | } | ||
| 358 | |||
| 251 | void rds_ib_exit(void) | 359 | void rds_ib_exit(void) |
| 252 | { | 360 | { |
| 253 | rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); | 361 | rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); |
| 362 | rds_ib_unregister_client(); | ||
| 254 | rds_ib_destroy_nodev_conns(); | 363 | rds_ib_destroy_nodev_conns(); |
| 255 | ib_unregister_client(&rds_ib_client); | ||
| 256 | rds_ib_sysctl_exit(); | 364 | rds_ib_sysctl_exit(); |
| 257 | rds_ib_recv_exit(); | 365 | rds_ib_recv_exit(); |
| 258 | rds_trans_unregister(&rds_ib_transport); | 366 | rds_trans_unregister(&rds_ib_transport); |
| 367 | rds_ib_fmr_exit(); | ||
| 259 | } | 368 | } |
| 260 | 369 | ||
| 261 | struct rds_transport rds_ib_transport = { | 370 | struct rds_transport rds_ib_transport = { |
| 262 | .laddr_check = rds_ib_laddr_check, | 371 | .laddr_check = rds_ib_laddr_check, |
| 263 | .xmit_complete = rds_ib_xmit_complete, | 372 | .xmit_complete = rds_ib_xmit_complete, |
| 264 | .xmit = rds_ib_xmit, | 373 | .xmit = rds_ib_xmit, |
| 265 | .xmit_cong_map = NULL, | ||
| 266 | .xmit_rdma = rds_ib_xmit_rdma, | 374 | .xmit_rdma = rds_ib_xmit_rdma, |
| 375 | .xmit_atomic = rds_ib_xmit_atomic, | ||
| 267 | .recv = rds_ib_recv, | 376 | .recv = rds_ib_recv, |
| 268 | .conn_alloc = rds_ib_conn_alloc, | 377 | .conn_alloc = rds_ib_conn_alloc, |
| 269 | .conn_free = rds_ib_conn_free, | 378 | .conn_free = rds_ib_conn_free, |
| 270 | .conn_connect = rds_ib_conn_connect, | 379 | .conn_connect = rds_ib_conn_connect, |
| 271 | .conn_shutdown = rds_ib_conn_shutdown, | 380 | .conn_shutdown = rds_ib_conn_shutdown, |
| 272 | .inc_copy_to_user = rds_ib_inc_copy_to_user, | 381 | .inc_copy_to_user = rds_ib_inc_copy_to_user, |
| 273 | .inc_purge = rds_ib_inc_purge, | ||
| 274 | .inc_free = rds_ib_inc_free, | 382 | .inc_free = rds_ib_inc_free, |
| 275 | .cm_initiate_connect = rds_ib_cm_initiate_connect, | 383 | .cm_initiate_connect = rds_ib_cm_initiate_connect, |
| 276 | .cm_handle_connect = rds_ib_cm_handle_connect, | 384 | .cm_handle_connect = rds_ib_cm_handle_connect, |
| @@ -286,16 +394,20 @@ struct rds_transport rds_ib_transport = { | |||
| 286 | .t_type = RDS_TRANS_IB | 394 | .t_type = RDS_TRANS_IB |
| 287 | }; | 395 | }; |
| 288 | 396 | ||
| 289 | int __init rds_ib_init(void) | 397 | int rds_ib_init(void) |
| 290 | { | 398 | { |
| 291 | int ret; | 399 | int ret; |
| 292 | 400 | ||
| 293 | INIT_LIST_HEAD(&rds_ib_devices); | 401 | INIT_LIST_HEAD(&rds_ib_devices); |
| 294 | 402 | ||
| 295 | ret = ib_register_client(&rds_ib_client); | 403 | ret = rds_ib_fmr_init(); |
| 296 | if (ret) | 404 | if (ret) |
| 297 | goto out; | 405 | goto out; |
| 298 | 406 | ||
| 407 | ret = ib_register_client(&rds_ib_client); | ||
| 408 | if (ret) | ||
| 409 | goto out_fmr_exit; | ||
| 410 | |||
| 299 | ret = rds_ib_sysctl_init(); | 411 | ret = rds_ib_sysctl_init(); |
| 300 | if (ret) | 412 | if (ret) |
| 301 | goto out_ibreg; | 413 | goto out_ibreg; |
| @@ -317,7 +429,9 @@ out_recv: | |||
| 317 | out_sysctl: | 429 | out_sysctl: |
| 318 | rds_ib_sysctl_exit(); | 430 | rds_ib_sysctl_exit(); |
| 319 | out_ibreg: | 431 | out_ibreg: |
| 320 | ib_unregister_client(&rds_ib_client); | 432 | rds_ib_unregister_client(); |
| 433 | out_fmr_exit: | ||
| 434 | rds_ib_fmr_exit(); | ||
| 321 | out: | 435 | out: |
| 322 | return ret; | 436 | return ret; |
| 323 | } | 437 | } |
diff --git a/net/rds/ib.h b/net/rds/ib.h index 64df4e79b29f..7ad3d57e06a5 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h | |||
| @@ -3,11 +3,13 @@ | |||
| 3 | 3 | ||
| 4 | #include <rdma/ib_verbs.h> | 4 | #include <rdma/ib_verbs.h> |
| 5 | #include <rdma/rdma_cm.h> | 5 | #include <rdma/rdma_cm.h> |
| 6 | #include <linux/pci.h> | ||
| 7 | #include <linux/slab.h> | ||
| 6 | #include "rds.h" | 8 | #include "rds.h" |
| 7 | #include "rdma_transport.h" | 9 | #include "rdma_transport.h" |
| 8 | 10 | ||
| 9 | #define RDS_FMR_SIZE 256 | 11 | #define RDS_FMR_SIZE 256 |
| 10 | #define RDS_FMR_POOL_SIZE 4096 | 12 | #define RDS_FMR_POOL_SIZE 8192 |
| 11 | 13 | ||
| 12 | #define RDS_IB_MAX_SGE 8 | 14 | #define RDS_IB_MAX_SGE 8 |
| 13 | #define RDS_IB_RECV_SGE 2 | 15 | #define RDS_IB_RECV_SGE 2 |
| @@ -19,6 +21,9 @@ | |||
| 19 | 21 | ||
| 20 | #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ | 22 | #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ |
| 21 | 23 | ||
| 24 | #define RDS_IB_RECYCLE_BATCH_COUNT 32 | ||
| 25 | |||
| 26 | extern struct rw_semaphore rds_ib_devices_lock; | ||
| 22 | extern struct list_head rds_ib_devices; | 27 | extern struct list_head rds_ib_devices; |
| 23 | 28 | ||
| 24 | /* | 29 | /* |
| @@ -26,20 +31,29 @@ extern struct list_head rds_ib_devices; | |||
| 26 | * try and minimize the amount of memory tied up both the device and | 31 | * try and minimize the amount of memory tied up both the device and |
| 27 | * socket receive queues. | 32 | * socket receive queues. |
| 28 | */ | 33 | */ |
| 29 | /* page offset of the final full frag that fits in the page */ | ||
| 30 | #define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) | ||
| 31 | struct rds_page_frag { | 34 | struct rds_page_frag { |
| 32 | struct list_head f_item; | 35 | struct list_head f_item; |
| 33 | struct page *f_page; | 36 | struct list_head f_cache_entry; |
| 34 | unsigned long f_offset; | 37 | struct scatterlist f_sg; |
| 35 | dma_addr_t f_mapped; | ||
| 36 | }; | 38 | }; |
| 37 | 39 | ||
| 38 | struct rds_ib_incoming { | 40 | struct rds_ib_incoming { |
| 39 | struct list_head ii_frags; | 41 | struct list_head ii_frags; |
| 42 | struct list_head ii_cache_entry; | ||
| 40 | struct rds_incoming ii_inc; | 43 | struct rds_incoming ii_inc; |
| 41 | }; | 44 | }; |
| 42 | 45 | ||
| 46 | struct rds_ib_cache_head { | ||
| 47 | struct list_head *first; | ||
| 48 | unsigned long count; | ||
| 49 | }; | ||
| 50 | |||
| 51 | struct rds_ib_refill_cache { | ||
| 52 | struct rds_ib_cache_head *percpu; | ||
| 53 | struct list_head *xfer; | ||
| 54 | struct list_head *ready; | ||
| 55 | }; | ||
| 56 | |||
| 43 | struct rds_ib_connect_private { | 57 | struct rds_ib_connect_private { |
| 44 | /* Add new fields at the end, and don't permute existing fields. */ | 58 | /* Add new fields at the end, and don't permute existing fields. */ |
| 45 | __be32 dp_saddr; | 59 | __be32 dp_saddr; |
| @@ -53,8 +67,7 @@ struct rds_ib_connect_private { | |||
| 53 | }; | 67 | }; |
| 54 | 68 | ||
| 55 | struct rds_ib_send_work { | 69 | struct rds_ib_send_work { |
| 56 | struct rds_message *s_rm; | 70 | void *s_op; |
| 57 | struct rds_rdma_op *s_op; | ||
| 58 | struct ib_send_wr s_wr; | 71 | struct ib_send_wr s_wr; |
| 59 | struct ib_sge s_sge[RDS_IB_MAX_SGE]; | 72 | struct ib_sge s_sge[RDS_IB_MAX_SGE]; |
| 60 | unsigned long s_queued; | 73 | unsigned long s_queued; |
| @@ -92,10 +105,11 @@ struct rds_ib_connection { | |||
| 92 | 105 | ||
| 93 | /* tx */ | 106 | /* tx */ |
| 94 | struct rds_ib_work_ring i_send_ring; | 107 | struct rds_ib_work_ring i_send_ring; |
| 95 | struct rds_message *i_rm; | 108 | struct rm_data_op *i_data_op; |
| 96 | struct rds_header *i_send_hdrs; | 109 | struct rds_header *i_send_hdrs; |
| 97 | u64 i_send_hdrs_dma; | 110 | u64 i_send_hdrs_dma; |
| 98 | struct rds_ib_send_work *i_sends; | 111 | struct rds_ib_send_work *i_sends; |
| 112 | atomic_t i_signaled_sends; | ||
| 99 | 113 | ||
| 100 | /* rx */ | 114 | /* rx */ |
| 101 | struct tasklet_struct i_recv_tasklet; | 115 | struct tasklet_struct i_recv_tasklet; |
| @@ -106,8 +120,9 @@ struct rds_ib_connection { | |||
| 106 | struct rds_header *i_recv_hdrs; | 120 | struct rds_header *i_recv_hdrs; |
| 107 | u64 i_recv_hdrs_dma; | 121 | u64 i_recv_hdrs_dma; |
| 108 | struct rds_ib_recv_work *i_recvs; | 122 | struct rds_ib_recv_work *i_recvs; |
| 109 | struct rds_page_frag i_frag; | ||
| 110 | u64 i_ack_recv; /* last ACK received */ | 123 | u64 i_ack_recv; /* last ACK received */ |
| 124 | struct rds_ib_refill_cache i_cache_incs; | ||
| 125 | struct rds_ib_refill_cache i_cache_frags; | ||
| 111 | 126 | ||
| 112 | /* sending acks */ | 127 | /* sending acks */ |
| 113 | unsigned long i_ack_flags; | 128 | unsigned long i_ack_flags; |
| @@ -138,7 +153,6 @@ struct rds_ib_connection { | |||
| 138 | 153 | ||
| 139 | /* Batched completions */ | 154 | /* Batched completions */ |
| 140 | unsigned int i_unsignaled_wrs; | 155 | unsigned int i_unsignaled_wrs; |
| 141 | long i_unsignaled_bytes; | ||
| 142 | }; | 156 | }; |
| 143 | 157 | ||
| 144 | /* This assumes that atomic_t is at least 32 bits */ | 158 | /* This assumes that atomic_t is at least 32 bits */ |
| @@ -164,9 +178,17 @@ struct rds_ib_device { | |||
| 164 | unsigned int max_fmrs; | 178 | unsigned int max_fmrs; |
| 165 | int max_sge; | 179 | int max_sge; |
| 166 | unsigned int max_wrs; | 180 | unsigned int max_wrs; |
| 181 | unsigned int max_initiator_depth; | ||
| 182 | unsigned int max_responder_resources; | ||
| 167 | spinlock_t spinlock; /* protect the above */ | 183 | spinlock_t spinlock; /* protect the above */ |
| 184 | atomic_t refcount; | ||
| 185 | struct work_struct free_work; | ||
| 168 | }; | 186 | }; |
| 169 | 187 | ||
| 188 | #define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus) | ||
| 189 | #define ibdev_to_node(ibdev) pcidev_to_node(to_pci_dev(ibdev->dma_device)) | ||
| 190 | #define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev) | ||
| 191 | |||
| 170 | /* bits for i_ack_flags */ | 192 | /* bits for i_ack_flags */ |
| 171 | #define IB_ACK_IN_FLIGHT 0 | 193 | #define IB_ACK_IN_FLIGHT 0 |
| 172 | #define IB_ACK_REQUESTED 1 | 194 | #define IB_ACK_REQUESTED 1 |
| @@ -202,6 +224,8 @@ struct rds_ib_statistics { | |||
| 202 | uint64_t s_ib_rdma_mr_pool_flush; | 224 | uint64_t s_ib_rdma_mr_pool_flush; |
| 203 | uint64_t s_ib_rdma_mr_pool_wait; | 225 | uint64_t s_ib_rdma_mr_pool_wait; |
| 204 | uint64_t s_ib_rdma_mr_pool_depleted; | 226 | uint64_t s_ib_rdma_mr_pool_depleted; |
| 227 | uint64_t s_ib_atomic_cswp; | ||
| 228 | uint64_t s_ib_atomic_fadd; | ||
| 205 | }; | 229 | }; |
| 206 | 230 | ||
| 207 | extern struct workqueue_struct *rds_ib_wq; | 231 | extern struct workqueue_struct *rds_ib_wq; |
| @@ -243,6 +267,8 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, | |||
| 243 | extern struct rds_transport rds_ib_transport; | 267 | extern struct rds_transport rds_ib_transport; |
| 244 | extern void rds_ib_add_one(struct ib_device *device); | 268 | extern void rds_ib_add_one(struct ib_device *device); |
| 245 | extern void rds_ib_remove_one(struct ib_device *device); | 269 | extern void rds_ib_remove_one(struct ib_device *device); |
| 270 | struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); | ||
| 271 | void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); | ||
| 246 | extern struct ib_client rds_ib_client; | 272 | extern struct ib_client rds_ib_client; |
| 247 | 273 | ||
| 248 | extern unsigned int fmr_pool_size; | 274 | extern unsigned int fmr_pool_size; |
| @@ -258,7 +284,7 @@ void rds_ib_conn_free(void *arg); | |||
| 258 | int rds_ib_conn_connect(struct rds_connection *conn); | 284 | int rds_ib_conn_connect(struct rds_connection *conn); |
| 259 | void rds_ib_conn_shutdown(struct rds_connection *conn); | 285 | void rds_ib_conn_shutdown(struct rds_connection *conn); |
| 260 | void rds_ib_state_change(struct sock *sk); | 286 | void rds_ib_state_change(struct sock *sk); |
| 261 | int __init rds_ib_listen_init(void); | 287 | int rds_ib_listen_init(void); |
| 262 | void rds_ib_listen_stop(void); | 288 | void rds_ib_listen_stop(void); |
| 263 | void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); | 289 | void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); |
| 264 | int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, | 290 | int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, |
| @@ -275,15 +301,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, | |||
| 275 | int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); | 301 | int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); |
| 276 | void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); | 302 | void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); |
| 277 | void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); | 303 | void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); |
| 278 | void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock); | 304 | void rds_ib_destroy_nodev_conns(void); |
| 279 | static inline void rds_ib_destroy_nodev_conns(void) | ||
| 280 | { | ||
| 281 | __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock); | ||
| 282 | } | ||
| 283 | static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev) | ||
| 284 | { | ||
| 285 | __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock); | ||
| 286 | } | ||
| 287 | struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); | 305 | struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); |
| 288 | void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); | 306 | void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); |
| 289 | void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); | 307 | void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); |
| @@ -292,14 +310,16 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | |||
| 292 | void rds_ib_sync_mr(void *trans_private, int dir); | 310 | void rds_ib_sync_mr(void *trans_private, int dir); |
| 293 | void rds_ib_free_mr(void *trans_private, int invalidate); | 311 | void rds_ib_free_mr(void *trans_private, int invalidate); |
| 294 | void rds_ib_flush_mrs(void); | 312 | void rds_ib_flush_mrs(void); |
| 313 | int rds_ib_fmr_init(void); | ||
| 314 | void rds_ib_fmr_exit(void); | ||
| 295 | 315 | ||
| 296 | /* ib_recv.c */ | 316 | /* ib_recv.c */ |
| 297 | int __init rds_ib_recv_init(void); | 317 | int rds_ib_recv_init(void); |
| 298 | void rds_ib_recv_exit(void); | 318 | void rds_ib_recv_exit(void); |
| 299 | int rds_ib_recv(struct rds_connection *conn); | 319 | int rds_ib_recv(struct rds_connection *conn); |
| 300 | int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | 320 | int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); |
| 301 | gfp_t page_gfp, int prefill); | 321 | void rds_ib_recv_free_caches(struct rds_ib_connection *ic); |
| 302 | void rds_ib_inc_purge(struct rds_incoming *inc); | 322 | void rds_ib_recv_refill(struct rds_connection *conn, int prefill); |
| 303 | void rds_ib_inc_free(struct rds_incoming *inc); | 323 | void rds_ib_inc_free(struct rds_incoming *inc); |
| 304 | int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, | 324 | int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, |
| 305 | size_t size); | 325 | size_t size); |
| @@ -325,17 +345,19 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); | |||
| 325 | extern wait_queue_head_t rds_ib_ring_empty_wait; | 345 | extern wait_queue_head_t rds_ib_ring_empty_wait; |
| 326 | 346 | ||
| 327 | /* ib_send.c */ | 347 | /* ib_send.c */ |
| 348 | char *rds_ib_wc_status_str(enum ib_wc_status status); | ||
| 328 | void rds_ib_xmit_complete(struct rds_connection *conn); | 349 | void rds_ib_xmit_complete(struct rds_connection *conn); |
| 329 | int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | 350 | int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, |
| 330 | unsigned int hdr_off, unsigned int sg, unsigned int off); | 351 | unsigned int hdr_off, unsigned int sg, unsigned int off); |
| 331 | void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); | 352 | void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); |
| 332 | void rds_ib_send_init_ring(struct rds_ib_connection *ic); | 353 | void rds_ib_send_init_ring(struct rds_ib_connection *ic); |
| 333 | void rds_ib_send_clear_ring(struct rds_ib_connection *ic); | 354 | void rds_ib_send_clear_ring(struct rds_ib_connection *ic); |
| 334 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); | 355 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); |
| 335 | void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); | 356 | void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); |
| 336 | void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); | 357 | void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); |
| 337 | int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, | 358 | int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, |
| 338 | u32 *adv_credits, int need_posted, int max_posted); | 359 | u32 *adv_credits, int need_posted, int max_posted); |
| 360 | int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); | ||
| 339 | 361 | ||
| 340 | /* ib_stats.c */ | 362 | /* ib_stats.c */ |
| 341 | DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); | 363 | DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); |
| @@ -344,7 +366,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, | |||
| 344 | unsigned int avail); | 366 | unsigned int avail); |
| 345 | 367 | ||
| 346 | /* ib_sysctl.c */ | 368 | /* ib_sysctl.c */ |
| 347 | int __init rds_ib_sysctl_init(void); | 369 | int rds_ib_sysctl_init(void); |
| 348 | void rds_ib_sysctl_exit(void); | 370 | void rds_ib_sysctl_exit(void); |
| 349 | extern unsigned long rds_ib_sysctl_max_send_wr; | 371 | extern unsigned long rds_ib_sysctl_max_send_wr; |
| 350 | extern unsigned long rds_ib_sysctl_max_recv_wr; | 372 | extern unsigned long rds_ib_sysctl_max_recv_wr; |
| @@ -354,28 +376,4 @@ extern unsigned long rds_ib_sysctl_max_recv_allocation; | |||
| 354 | extern unsigned int rds_ib_sysctl_flow_control; | 376 | extern unsigned int rds_ib_sysctl_flow_control; |
| 355 | extern ctl_table rds_ib_sysctl_table[]; | 377 | extern ctl_table rds_ib_sysctl_table[]; |
| 356 | 378 | ||
| 357 | /* | ||
| 358 | * Helper functions for getting/setting the header and data SGEs in | ||
| 359 | * RDS packets (not RDMA) | ||
| 360 | * | ||
| 361 | * From version 3.1 onwards, header is in front of data in the sge. | ||
| 362 | */ | ||
| 363 | static inline struct ib_sge * | ||
| 364 | rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge) | ||
| 365 | { | ||
| 366 | if (ic->conn->c_version > RDS_PROTOCOL_3_0) | ||
| 367 | return &sge[0]; | ||
| 368 | else | ||
| 369 | return &sge[1]; | ||
| 370 | } | ||
| 371 | |||
| 372 | static inline struct ib_sge * | ||
| 373 | rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge) | ||
| 374 | { | ||
| 375 | if (ic->conn->c_version > RDS_PROTOCOL_3_0) | ||
| 376 | return &sge[1]; | ||
| 377 | else | ||
| 378 | return &sge[0]; | ||
| 379 | } | ||
| 380 | |||
| 381 | #endif | 379 | #endif |
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index f68832798db2..bc3dbc1ba61f 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c | |||
| @@ -38,6 +38,36 @@ | |||
| 38 | #include "rds.h" | 38 | #include "rds.h" |
| 39 | #include "ib.h" | 39 | #include "ib.h" |
| 40 | 40 | ||
| 41 | static char *rds_ib_event_type_strings[] = { | ||
| 42 | #define RDS_IB_EVENT_STRING(foo) \ | ||
| 43 | [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo) | ||
| 44 | RDS_IB_EVENT_STRING(CQ_ERR), | ||
| 45 | RDS_IB_EVENT_STRING(QP_FATAL), | ||
| 46 | RDS_IB_EVENT_STRING(QP_REQ_ERR), | ||
| 47 | RDS_IB_EVENT_STRING(QP_ACCESS_ERR), | ||
| 48 | RDS_IB_EVENT_STRING(COMM_EST), | ||
| 49 | RDS_IB_EVENT_STRING(SQ_DRAINED), | ||
| 50 | RDS_IB_EVENT_STRING(PATH_MIG), | ||
| 51 | RDS_IB_EVENT_STRING(PATH_MIG_ERR), | ||
| 52 | RDS_IB_EVENT_STRING(DEVICE_FATAL), | ||
| 53 | RDS_IB_EVENT_STRING(PORT_ACTIVE), | ||
| 54 | RDS_IB_EVENT_STRING(PORT_ERR), | ||
| 55 | RDS_IB_EVENT_STRING(LID_CHANGE), | ||
| 56 | RDS_IB_EVENT_STRING(PKEY_CHANGE), | ||
| 57 | RDS_IB_EVENT_STRING(SM_CHANGE), | ||
| 58 | RDS_IB_EVENT_STRING(SRQ_ERR), | ||
| 59 | RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED), | ||
| 60 | RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED), | ||
| 61 | RDS_IB_EVENT_STRING(CLIENT_REREGISTER), | ||
| 62 | #undef RDS_IB_EVENT_STRING | ||
| 63 | }; | ||
| 64 | |||
| 65 | static char *rds_ib_event_str(enum ib_event_type type) | ||
| 66 | { | ||
| 67 | return rds_str_array(rds_ib_event_type_strings, | ||
| 68 | ARRAY_SIZE(rds_ib_event_type_strings), type); | ||
| 69 | }; | ||
| 70 | |||
| 41 | /* | 71 | /* |
| 42 | * Set the selected protocol version | 72 | * Set the selected protocol version |
| 43 | */ | 73 | */ |
| @@ -95,7 +125,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even | |||
| 95 | { | 125 | { |
| 96 | const struct rds_ib_connect_private *dp = NULL; | 126 | const struct rds_ib_connect_private *dp = NULL; |
| 97 | struct rds_ib_connection *ic = conn->c_transport_data; | 127 | struct rds_ib_connection *ic = conn->c_transport_data; |
| 98 | struct rds_ib_device *rds_ibdev; | ||
| 99 | struct ib_qp_attr qp_attr; | 128 | struct ib_qp_attr qp_attr; |
| 100 | int err; | 129 | int err; |
| 101 | 130 | ||
| @@ -111,11 +140,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even | |||
| 111 | } | 140 | } |
| 112 | } | 141 | } |
| 113 | 142 | ||
| 114 | printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", | 143 | if (conn->c_version < RDS_PROTOCOL(3,1)) { |
| 115 | &conn->c_faddr, | 144 | printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed," |
| 116 | RDS_PROTOCOL_MAJOR(conn->c_version), | 145 | " no longer supported\n", |
| 117 | RDS_PROTOCOL_MINOR(conn->c_version), | 146 | &conn->c_faddr, |
| 118 | ic->i_flowctl ? ", flow control" : ""); | 147 | RDS_PROTOCOL_MAJOR(conn->c_version), |
| 148 | RDS_PROTOCOL_MINOR(conn->c_version)); | ||
| 149 | rds_conn_destroy(conn); | ||
| 150 | return; | ||
| 151 | } else { | ||
| 152 | printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", | ||
| 153 | &conn->c_faddr, | ||
| 154 | RDS_PROTOCOL_MAJOR(conn->c_version), | ||
| 155 | RDS_PROTOCOL_MINOR(conn->c_version), | ||
| 156 | ic->i_flowctl ? ", flow control" : ""); | ||
| 157 | } | ||
| 119 | 158 | ||
| 120 | /* | 159 | /* |
| 121 | * Init rings and fill recv. this needs to wait until protocol negotiation | 160 | * Init rings and fill recv. this needs to wait until protocol negotiation |
| @@ -125,7 +164,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even | |||
| 125 | rds_ib_recv_init_ring(ic); | 164 | rds_ib_recv_init_ring(ic); |
| 126 | /* Post receive buffers - as a side effect, this will update | 165 | /* Post receive buffers - as a side effect, this will update |
| 127 | * the posted credit count. */ | 166 | * the posted credit count. */ |
| 128 | rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); | 167 | rds_ib_recv_refill(conn, 1); |
| 129 | 168 | ||
| 130 | /* Tune RNR behavior */ | 169 | /* Tune RNR behavior */ |
| 131 | rds_ib_tune_rnr(ic, &qp_attr); | 170 | rds_ib_tune_rnr(ic, &qp_attr); |
| @@ -135,12 +174,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even | |||
| 135 | if (err) | 174 | if (err) |
| 136 | printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); | 175 | printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); |
| 137 | 176 | ||
| 138 | /* update ib_device with this local ipaddr & conn */ | 177 | /* update ib_device with this local ipaddr */ |
| 139 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | 178 | err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); |
| 140 | err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr); | ||
| 141 | if (err) | 179 | if (err) |
| 142 | printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); | 180 | printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", |
| 143 | rds_ib_add_conn(rds_ibdev, conn); | 181 | err); |
| 144 | 182 | ||
| 145 | /* If the peer gave us the last packet it saw, process this as if | 183 | /* If the peer gave us the last packet it saw, process this as if |
| 146 | * we had received a regular ACK. */ | 184 | * we had received a regular ACK. */ |
| @@ -153,18 +191,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even | |||
| 153 | static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, | 191 | static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, |
| 154 | struct rdma_conn_param *conn_param, | 192 | struct rdma_conn_param *conn_param, |
| 155 | struct rds_ib_connect_private *dp, | 193 | struct rds_ib_connect_private *dp, |
| 156 | u32 protocol_version) | 194 | u32 protocol_version, |
| 195 | u32 max_responder_resources, | ||
| 196 | u32 max_initiator_depth) | ||
| 157 | { | 197 | { |
| 198 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
| 199 | struct rds_ib_device *rds_ibdev = ic->rds_ibdev; | ||
| 200 | |||
| 158 | memset(conn_param, 0, sizeof(struct rdma_conn_param)); | 201 | memset(conn_param, 0, sizeof(struct rdma_conn_param)); |
| 159 | /* XXX tune these? */ | 202 | |
| 160 | conn_param->responder_resources = 1; | 203 | conn_param->responder_resources = |
| 161 | conn_param->initiator_depth = 1; | 204 | min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources); |
| 205 | conn_param->initiator_depth = | ||
| 206 | min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth); | ||
| 162 | conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); | 207 | conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); |
| 163 | conn_param->rnr_retry_count = 7; | 208 | conn_param->rnr_retry_count = 7; |
| 164 | 209 | ||
| 165 | if (dp) { | 210 | if (dp) { |
| 166 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
| 167 | |||
| 168 | memset(dp, 0, sizeof(*dp)); | 211 | memset(dp, 0, sizeof(*dp)); |
| 169 | dp->dp_saddr = conn->c_laddr; | 212 | dp->dp_saddr = conn->c_laddr; |
| 170 | dp->dp_daddr = conn->c_faddr; | 213 | dp->dp_daddr = conn->c_faddr; |
| @@ -189,7 +232,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, | |||
| 189 | 232 | ||
| 190 | static void rds_ib_cq_event_handler(struct ib_event *event, void *data) | 233 | static void rds_ib_cq_event_handler(struct ib_event *event, void *data) |
| 191 | { | 234 | { |
| 192 | rdsdebug("event %u data %p\n", event->event, data); | 235 | rdsdebug("event %u (%s) data %p\n", |
| 236 | event->event, rds_ib_event_str(event->event), data); | ||
| 193 | } | 237 | } |
| 194 | 238 | ||
| 195 | static void rds_ib_qp_event_handler(struct ib_event *event, void *data) | 239 | static void rds_ib_qp_event_handler(struct ib_event *event, void *data) |
| @@ -197,16 +241,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) | |||
| 197 | struct rds_connection *conn = data; | 241 | struct rds_connection *conn = data; |
| 198 | struct rds_ib_connection *ic = conn->c_transport_data; | 242 | struct rds_ib_connection *ic = conn->c_transport_data; |
| 199 | 243 | ||
| 200 | rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); | 244 | rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event, |
| 245 | rds_ib_event_str(event->event)); | ||
| 201 | 246 | ||
| 202 | switch (event->event) { | 247 | switch (event->event) { |
| 203 | case IB_EVENT_COMM_EST: | 248 | case IB_EVENT_COMM_EST: |
| 204 | rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); | 249 | rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); |
| 205 | break; | 250 | break; |
| 206 | default: | 251 | default: |
| 207 | rdsdebug("Fatal QP Event %u " | 252 | rdsdebug("Fatal QP Event %u (%s) " |
| 208 | "- connection %pI4->%pI4, reconnecting\n", | 253 | "- connection %pI4->%pI4, reconnecting\n", |
| 209 | event->event, &conn->c_laddr, &conn->c_faddr); | 254 | event->event, rds_ib_event_str(event->event), |
| 255 | &conn->c_laddr, &conn->c_faddr); | ||
| 210 | rds_conn_drop(conn); | 256 | rds_conn_drop(conn); |
| 211 | break; | 257 | break; |
| 212 | } | 258 | } |
| @@ -224,18 +270,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn) | |||
| 224 | struct rds_ib_device *rds_ibdev; | 270 | struct rds_ib_device *rds_ibdev; |
| 225 | int ret; | 271 | int ret; |
| 226 | 272 | ||
| 227 | /* rds_ib_add_one creates a rds_ib_device object per IB device, | 273 | /* |
| 228 | * and allocates a protection domain, memory range and FMR pool | 274 | * It's normal to see a null device if an incoming connection races |
| 229 | * for each. If that fails for any reason, it will not register | 275 | * with device removal, so we don't print a warning. |
| 230 | * the rds_ibdev at all. | ||
| 231 | */ | 276 | */ |
| 232 | rds_ibdev = ib_get_client_data(dev, &rds_ib_client); | 277 | rds_ibdev = rds_ib_get_client_data(dev); |
| 233 | if (rds_ibdev == NULL) { | 278 | if (!rds_ibdev) |
| 234 | if (printk_ratelimit()) | ||
| 235 | printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n", | ||
| 236 | dev->name); | ||
| 237 | return -EOPNOTSUPP; | 279 | return -EOPNOTSUPP; |
| 238 | } | 280 | |
| 281 | /* add the conn now so that connection establishment has the dev */ | ||
| 282 | rds_ib_add_conn(rds_ibdev, conn); | ||
| 239 | 283 | ||
| 240 | if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) | 284 | if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) |
| 241 | rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); | 285 | rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); |
| @@ -306,7 +350,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) | |||
| 306 | ic->i_send_ring.w_nr * | 350 | ic->i_send_ring.w_nr * |
| 307 | sizeof(struct rds_header), | 351 | sizeof(struct rds_header), |
| 308 | &ic->i_send_hdrs_dma, GFP_KERNEL); | 352 | &ic->i_send_hdrs_dma, GFP_KERNEL); |
| 309 | if (ic->i_send_hdrs == NULL) { | 353 | if (!ic->i_send_hdrs) { |
| 310 | ret = -ENOMEM; | 354 | ret = -ENOMEM; |
| 311 | rdsdebug("ib_dma_alloc_coherent send failed\n"); | 355 | rdsdebug("ib_dma_alloc_coherent send failed\n"); |
| 312 | goto out; | 356 | goto out; |
| @@ -316,7 +360,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) | |||
| 316 | ic->i_recv_ring.w_nr * | 360 | ic->i_recv_ring.w_nr * |
| 317 | sizeof(struct rds_header), | 361 | sizeof(struct rds_header), |
| 318 | &ic->i_recv_hdrs_dma, GFP_KERNEL); | 362 | &ic->i_recv_hdrs_dma, GFP_KERNEL); |
| 319 | if (ic->i_recv_hdrs == NULL) { | 363 | if (!ic->i_recv_hdrs) { |
| 320 | ret = -ENOMEM; | 364 | ret = -ENOMEM; |
| 321 | rdsdebug("ib_dma_alloc_coherent recv failed\n"); | 365 | rdsdebug("ib_dma_alloc_coherent recv failed\n"); |
| 322 | goto out; | 366 | goto out; |
| @@ -324,22 +368,24 @@ static int rds_ib_setup_qp(struct rds_connection *conn) | |||
| 324 | 368 | ||
| 325 | ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), | 369 | ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), |
| 326 | &ic->i_ack_dma, GFP_KERNEL); | 370 | &ic->i_ack_dma, GFP_KERNEL); |
| 327 | if (ic->i_ack == NULL) { | 371 | if (!ic->i_ack) { |
| 328 | ret = -ENOMEM; | 372 | ret = -ENOMEM; |
| 329 | rdsdebug("ib_dma_alloc_coherent ack failed\n"); | 373 | rdsdebug("ib_dma_alloc_coherent ack failed\n"); |
| 330 | goto out; | 374 | goto out; |
| 331 | } | 375 | } |
| 332 | 376 | ||
| 333 | ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); | 377 | ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), |
| 334 | if (ic->i_sends == NULL) { | 378 | ibdev_to_node(dev)); |
| 379 | if (!ic->i_sends) { | ||
| 335 | ret = -ENOMEM; | 380 | ret = -ENOMEM; |
| 336 | rdsdebug("send allocation failed\n"); | 381 | rdsdebug("send allocation failed\n"); |
| 337 | goto out; | 382 | goto out; |
| 338 | } | 383 | } |
| 339 | memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); | 384 | memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); |
| 340 | 385 | ||
| 341 | ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); | 386 | ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), |
| 342 | if (ic->i_recvs == NULL) { | 387 | ibdev_to_node(dev)); |
| 388 | if (!ic->i_recvs) { | ||
| 343 | ret = -ENOMEM; | 389 | ret = -ENOMEM; |
| 344 | rdsdebug("recv allocation failed\n"); | 390 | rdsdebug("recv allocation failed\n"); |
| 345 | goto out; | 391 | goto out; |
| @@ -352,6 +398,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) | |||
| 352 | ic->i_send_cq, ic->i_recv_cq); | 398 | ic->i_send_cq, ic->i_recv_cq); |
| 353 | 399 | ||
| 354 | out: | 400 | out: |
| 401 | rds_ib_dev_put(rds_ibdev); | ||
| 355 | return ret; | 402 | return ret; |
| 356 | } | 403 | } |
| 357 | 404 | ||
| @@ -409,7 +456,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, | |||
| 409 | struct rds_ib_connection *ic = NULL; | 456 | struct rds_ib_connection *ic = NULL; |
| 410 | struct rdma_conn_param conn_param; | 457 | struct rdma_conn_param conn_param; |
| 411 | u32 version; | 458 | u32 version; |
| 412 | int err, destroy = 1; | 459 | int err = 1, destroy = 1; |
| 413 | 460 | ||
| 414 | /* Check whether the remote protocol version matches ours. */ | 461 | /* Check whether the remote protocol version matches ours. */ |
| 415 | version = rds_ib_protocol_compatible(event); | 462 | version = rds_ib_protocol_compatible(event); |
| @@ -448,7 +495,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, | |||
| 448 | /* Wait and see - our connect may still be succeeding */ | 495 | /* Wait and see - our connect may still be succeeding */ |
| 449 | rds_ib_stats_inc(s_ib_connect_raced); | 496 | rds_ib_stats_inc(s_ib_connect_raced); |
| 450 | } | 497 | } |
| 451 | mutex_unlock(&conn->c_cm_lock); | ||
| 452 | goto out; | 498 | goto out; |
| 453 | } | 499 | } |
| 454 | 500 | ||
| @@ -479,20 +525,20 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, | |||
| 479 | goto out; | 525 | goto out; |
| 480 | } | 526 | } |
| 481 | 527 | ||
| 482 | rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); | 528 | rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, |
| 529 | event->param.conn.responder_resources, | ||
| 530 | event->param.conn.initiator_depth); | ||
| 483 | 531 | ||
| 484 | /* rdma_accept() calls rdma_reject() internally if it fails */ | 532 | /* rdma_accept() calls rdma_reject() internally if it fails */ |
| 485 | err = rdma_accept(cm_id, &conn_param); | 533 | err = rdma_accept(cm_id, &conn_param); |
| 486 | mutex_unlock(&conn->c_cm_lock); | 534 | if (err) |
| 487 | if (err) { | ||
| 488 | rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); | 535 | rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); |
| 489 | goto out; | ||
| 490 | } | ||
| 491 | |||
| 492 | return 0; | ||
| 493 | 536 | ||
| 494 | out: | 537 | out: |
| 495 | rdma_reject(cm_id, NULL, 0); | 538 | if (conn) |
| 539 | mutex_unlock(&conn->c_cm_lock); | ||
| 540 | if (err) | ||
| 541 | rdma_reject(cm_id, NULL, 0); | ||
| 496 | return destroy; | 542 | return destroy; |
| 497 | } | 543 | } |
| 498 | 544 | ||
| @@ -516,8 +562,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) | |||
| 516 | goto out; | 562 | goto out; |
| 517 | } | 563 | } |
| 518 | 564 | ||
| 519 | rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); | 565 | rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, |
| 520 | 566 | UINT_MAX, UINT_MAX); | |
| 521 | ret = rdma_connect(cm_id, &conn_param); | 567 | ret = rdma_connect(cm_id, &conn_param); |
| 522 | if (ret) | 568 | if (ret) |
| 523 | rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); | 569 | rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); |
| @@ -601,9 +647,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) | |||
| 601 | ic->i_cm_id, err); | 647 | ic->i_cm_id, err); |
| 602 | } | 648 | } |
| 603 | 649 | ||
| 650 | /* | ||
| 651 | * We want to wait for tx and rx completion to finish | ||
| 652 | * before we tear down the connection, but we have to be | ||
| 653 | * careful not to get stuck waiting on a send ring that | ||
| 654 | * only has unsignaled sends in it. We've shutdown new | ||
| 655 | * sends before getting here so by waiting for signaled | ||
| 656 | * sends to complete we're ensured that there will be no | ||
| 657 | * more tx processing. | ||
| 658 | */ | ||
| 604 | wait_event(rds_ib_ring_empty_wait, | 659 | wait_event(rds_ib_ring_empty_wait, |
| 605 | rds_ib_ring_empty(&ic->i_send_ring) && | 660 | rds_ib_ring_empty(&ic->i_recv_ring) && |
| 606 | rds_ib_ring_empty(&ic->i_recv_ring)); | 661 | (atomic_read(&ic->i_signaled_sends) == 0)); |
| 662 | tasklet_kill(&ic->i_recv_tasklet); | ||
| 607 | 663 | ||
| 608 | if (ic->i_send_hdrs) | 664 | if (ic->i_send_hdrs) |
| 609 | ib_dma_free_coherent(dev, | 665 | ib_dma_free_coherent(dev, |
| @@ -654,9 +710,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) | |||
| 654 | BUG_ON(ic->rds_ibdev); | 710 | BUG_ON(ic->rds_ibdev); |
| 655 | 711 | ||
| 656 | /* Clear pending transmit */ | 712 | /* Clear pending transmit */ |
| 657 | if (ic->i_rm) { | 713 | if (ic->i_data_op) { |
| 658 | rds_message_put(ic->i_rm); | 714 | struct rds_message *rm; |
| 659 | ic->i_rm = NULL; | 715 | |
| 716 | rm = container_of(ic->i_data_op, struct rds_message, data); | ||
| 717 | rds_message_put(rm); | ||
| 718 | ic->i_data_op = NULL; | ||
| 660 | } | 719 | } |
| 661 | 720 | ||
| 662 | /* Clear the ACK state */ | 721 | /* Clear the ACK state */ |
| @@ -690,12 +749,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) | |||
| 690 | { | 749 | { |
| 691 | struct rds_ib_connection *ic; | 750 | struct rds_ib_connection *ic; |
| 692 | unsigned long flags; | 751 | unsigned long flags; |
| 752 | int ret; | ||
| 693 | 753 | ||
| 694 | /* XXX too lazy? */ | 754 | /* XXX too lazy? */ |
| 695 | ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); | 755 | ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); |
| 696 | if (ic == NULL) | 756 | if (!ic) |
| 697 | return -ENOMEM; | 757 | return -ENOMEM; |
| 698 | 758 | ||
| 759 | ret = rds_ib_recv_alloc_caches(ic); | ||
| 760 | if (ret) { | ||
| 761 | kfree(ic); | ||
| 762 | return ret; | ||
| 763 | } | ||
| 764 | |||
| 699 | INIT_LIST_HEAD(&ic->ib_node); | 765 | INIT_LIST_HEAD(&ic->ib_node); |
| 700 | tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, | 766 | tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, |
| 701 | (unsigned long) ic); | 767 | (unsigned long) ic); |
| @@ -703,6 +769,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) | |||
| 703 | #ifndef KERNEL_HAS_ATOMIC64 | 769 | #ifndef KERNEL_HAS_ATOMIC64 |
| 704 | spin_lock_init(&ic->i_ack_lock); | 770 | spin_lock_init(&ic->i_ack_lock); |
| 705 | #endif | 771 | #endif |
| 772 | atomic_set(&ic->i_signaled_sends, 0); | ||
| 706 | 773 | ||
| 707 | /* | 774 | /* |
| 708 | * rds_ib_conn_shutdown() waits for these to be emptied so they | 775 | * rds_ib_conn_shutdown() waits for these to be emptied so they |
| @@ -744,6 +811,8 @@ void rds_ib_conn_free(void *arg) | |||
| 744 | list_del(&ic->ib_node); | 811 | list_del(&ic->ib_node); |
| 745 | spin_unlock_irq(lock_ptr); | 812 | spin_unlock_irq(lock_ptr); |
| 746 | 813 | ||
| 814 | rds_ib_recv_free_caches(ic); | ||
| 815 | |||
| 747 | kfree(ic); | 816 | kfree(ic); |
| 748 | } | 817 | } |
| 749 | 818 | ||
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index a54cd63f9e35..8f6e221c9f78 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c | |||
| @@ -32,11 +32,16 @@ | |||
| 32 | */ | 32 | */ |
| 33 | #include <linux/kernel.h> | 33 | #include <linux/kernel.h> |
| 34 | #include <linux/slab.h> | 34 | #include <linux/slab.h> |
| 35 | #include <linux/rculist.h> | ||
| 35 | 36 | ||
| 36 | #include "rds.h" | 37 | #include "rds.h" |
| 37 | #include "rdma.h" | ||
| 38 | #include "ib.h" | 38 | #include "ib.h" |
| 39 | #include "xlist.h" | ||
| 39 | 40 | ||
| 41 | struct workqueue_struct *rds_ib_fmr_wq; | ||
| 42 | |||
| 43 | static DEFINE_PER_CPU(unsigned long, clean_list_grace); | ||
| 44 | #define CLEAN_LIST_BUSY_BIT 0 | ||
| 40 | 45 | ||
| 41 | /* | 46 | /* |
| 42 | * This is stored as mr->r_trans_private. | 47 | * This is stored as mr->r_trans_private. |
| @@ -45,7 +50,11 @@ struct rds_ib_mr { | |||
| 45 | struct rds_ib_device *device; | 50 | struct rds_ib_device *device; |
| 46 | struct rds_ib_mr_pool *pool; | 51 | struct rds_ib_mr_pool *pool; |
| 47 | struct ib_fmr *fmr; | 52 | struct ib_fmr *fmr; |
| 48 | struct list_head list; | 53 | |
| 54 | struct xlist_head xlist; | ||
| 55 | |||
| 56 | /* unmap_list is for freeing */ | ||
| 57 | struct list_head unmap_list; | ||
| 49 | unsigned int remap_count; | 58 | unsigned int remap_count; |
| 50 | 59 | ||
| 51 | struct scatterlist *sg; | 60 | struct scatterlist *sg; |
| @@ -59,14 +68,16 @@ struct rds_ib_mr { | |||
| 59 | */ | 68 | */ |
| 60 | struct rds_ib_mr_pool { | 69 | struct rds_ib_mr_pool { |
| 61 | struct mutex flush_lock; /* serialize fmr invalidate */ | 70 | struct mutex flush_lock; /* serialize fmr invalidate */ |
| 62 | struct work_struct flush_worker; /* flush worker */ | 71 | struct delayed_work flush_worker; /* flush worker */ |
| 63 | 72 | ||
| 64 | spinlock_t list_lock; /* protect variables below */ | ||
| 65 | atomic_t item_count; /* total # of MRs */ | 73 | atomic_t item_count; /* total # of MRs */ |
| 66 | atomic_t dirty_count; /* # dirty of MRs */ | 74 | atomic_t dirty_count; /* # dirty of MRs */ |
| 67 | struct list_head drop_list; /* MRs that have reached their max_maps limit */ | 75 | |
| 68 | struct list_head free_list; /* unused MRs */ | 76 | struct xlist_head drop_list; /* MRs that have reached their max_maps limit */ |
| 69 | struct list_head clean_list; /* unused & unamapped MRs */ | 77 | struct xlist_head free_list; /* unused MRs */ |
| 78 | struct xlist_head clean_list; /* global unused & unamapped MRs */ | ||
| 79 | wait_queue_head_t flush_wait; | ||
| 80 | |||
| 70 | atomic_t free_pinned; /* memory pinned by free MRs */ | 81 | atomic_t free_pinned; /* memory pinned by free MRs */ |
| 71 | unsigned long max_items; | 82 | unsigned long max_items; |
| 72 | unsigned long max_items_soft; | 83 | unsigned long max_items_soft; |
| @@ -74,7 +85,7 @@ struct rds_ib_mr_pool { | |||
| 74 | struct ib_fmr_attr fmr_attr; | 85 | struct ib_fmr_attr fmr_attr; |
| 75 | }; | 86 | }; |
| 76 | 87 | ||
| 77 | static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all); | 88 | static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); |
| 78 | static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); | 89 | static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); |
| 79 | static void rds_ib_mr_pool_flush_worker(struct work_struct *work); | 90 | static void rds_ib_mr_pool_flush_worker(struct work_struct *work); |
| 80 | 91 | ||
| @@ -83,16 +94,17 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) | |||
| 83 | struct rds_ib_device *rds_ibdev; | 94 | struct rds_ib_device *rds_ibdev; |
| 84 | struct rds_ib_ipaddr *i_ipaddr; | 95 | struct rds_ib_ipaddr *i_ipaddr; |
| 85 | 96 | ||
| 86 | list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { | 97 | rcu_read_lock(); |
| 87 | spin_lock_irq(&rds_ibdev->spinlock); | 98 | list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { |
| 88 | list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) { | 99 | list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { |
| 89 | if (i_ipaddr->ipaddr == ipaddr) { | 100 | if (i_ipaddr->ipaddr == ipaddr) { |
| 90 | spin_unlock_irq(&rds_ibdev->spinlock); | 101 | atomic_inc(&rds_ibdev->refcount); |
| 102 | rcu_read_unlock(); | ||
| 91 | return rds_ibdev; | 103 | return rds_ibdev; |
| 92 | } | 104 | } |
| 93 | } | 105 | } |
| 94 | spin_unlock_irq(&rds_ibdev->spinlock); | ||
| 95 | } | 106 | } |
| 107 | rcu_read_unlock(); | ||
| 96 | 108 | ||
| 97 | return NULL; | 109 | return NULL; |
| 98 | } | 110 | } |
| @@ -108,7 +120,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) | |||
| 108 | i_ipaddr->ipaddr = ipaddr; | 120 | i_ipaddr->ipaddr = ipaddr; |
| 109 | 121 | ||
| 110 | spin_lock_irq(&rds_ibdev->spinlock); | 122 | spin_lock_irq(&rds_ibdev->spinlock); |
| 111 | list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list); | 123 | list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list); |
| 112 | spin_unlock_irq(&rds_ibdev->spinlock); | 124 | spin_unlock_irq(&rds_ibdev->spinlock); |
| 113 | 125 | ||
| 114 | return 0; | 126 | return 0; |
| @@ -116,17 +128,24 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) | |||
| 116 | 128 | ||
| 117 | static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) | 129 | static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) |
| 118 | { | 130 | { |
| 119 | struct rds_ib_ipaddr *i_ipaddr, *next; | 131 | struct rds_ib_ipaddr *i_ipaddr; |
| 132 | struct rds_ib_ipaddr *to_free = NULL; | ||
| 133 | |||
| 120 | 134 | ||
| 121 | spin_lock_irq(&rds_ibdev->spinlock); | 135 | spin_lock_irq(&rds_ibdev->spinlock); |
| 122 | list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) { | 136 | list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { |
| 123 | if (i_ipaddr->ipaddr == ipaddr) { | 137 | if (i_ipaddr->ipaddr == ipaddr) { |
| 124 | list_del(&i_ipaddr->list); | 138 | list_del_rcu(&i_ipaddr->list); |
| 125 | kfree(i_ipaddr); | 139 | to_free = i_ipaddr; |
| 126 | break; | 140 | break; |
| 127 | } | 141 | } |
| 128 | } | 142 | } |
| 129 | spin_unlock_irq(&rds_ibdev->spinlock); | 143 | spin_unlock_irq(&rds_ibdev->spinlock); |
| 144 | |||
| 145 | if (to_free) { | ||
| 146 | synchronize_rcu(); | ||
| 147 | kfree(to_free); | ||
| 148 | } | ||
| 130 | } | 149 | } |
| 131 | 150 | ||
| 132 | int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) | 151 | int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) |
| @@ -134,8 +153,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) | |||
| 134 | struct rds_ib_device *rds_ibdev_old; | 153 | struct rds_ib_device *rds_ibdev_old; |
| 135 | 154 | ||
| 136 | rds_ibdev_old = rds_ib_get_device(ipaddr); | 155 | rds_ibdev_old = rds_ib_get_device(ipaddr); |
| 137 | if (rds_ibdev_old) | 156 | if (rds_ibdev_old) { |
| 138 | rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); | 157 | rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); |
| 158 | rds_ib_dev_put(rds_ibdev_old); | ||
| 159 | } | ||
| 139 | 160 | ||
| 140 | return rds_ib_add_ipaddr(rds_ibdev, ipaddr); | 161 | return rds_ib_add_ipaddr(rds_ibdev, ipaddr); |
| 141 | } | 162 | } |
| @@ -156,6 +177,7 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con | |||
| 156 | spin_unlock_irq(&ib_nodev_conns_lock); | 177 | spin_unlock_irq(&ib_nodev_conns_lock); |
| 157 | 178 | ||
| 158 | ic->rds_ibdev = rds_ibdev; | 179 | ic->rds_ibdev = rds_ibdev; |
| 180 | atomic_inc(&rds_ibdev->refcount); | ||
| 159 | } | 181 | } |
| 160 | 182 | ||
| 161 | void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) | 183 | void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) |
| @@ -175,18 +197,18 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection * | |||
| 175 | spin_unlock(&ib_nodev_conns_lock); | 197 | spin_unlock(&ib_nodev_conns_lock); |
| 176 | 198 | ||
| 177 | ic->rds_ibdev = NULL; | 199 | ic->rds_ibdev = NULL; |
| 200 | rds_ib_dev_put(rds_ibdev); | ||
| 178 | } | 201 | } |
| 179 | 202 | ||
| 180 | void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock) | 203 | void rds_ib_destroy_nodev_conns(void) |
| 181 | { | 204 | { |
| 182 | struct rds_ib_connection *ic, *_ic; | 205 | struct rds_ib_connection *ic, *_ic; |
| 183 | LIST_HEAD(tmp_list); | 206 | LIST_HEAD(tmp_list); |
| 184 | 207 | ||
| 185 | /* avoid calling conn_destroy with irqs off */ | 208 | /* avoid calling conn_destroy with irqs off */ |
| 186 | spin_lock_irq(list_lock); | 209 | spin_lock_irq(&ib_nodev_conns_lock); |
| 187 | list_splice(list, &tmp_list); | 210 | list_splice(&ib_nodev_conns, &tmp_list); |
| 188 | INIT_LIST_HEAD(list); | 211 | spin_unlock_irq(&ib_nodev_conns_lock); |
| 189 | spin_unlock_irq(list_lock); | ||
| 190 | 212 | ||
| 191 | list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) | 213 | list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) |
| 192 | rds_conn_destroy(ic->conn); | 214 | rds_conn_destroy(ic->conn); |
| @@ -200,12 +222,12 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) | |||
| 200 | if (!pool) | 222 | if (!pool) |
| 201 | return ERR_PTR(-ENOMEM); | 223 | return ERR_PTR(-ENOMEM); |
| 202 | 224 | ||
| 203 | INIT_LIST_HEAD(&pool->free_list); | 225 | INIT_XLIST_HEAD(&pool->free_list); |
| 204 | INIT_LIST_HEAD(&pool->drop_list); | 226 | INIT_XLIST_HEAD(&pool->drop_list); |
| 205 | INIT_LIST_HEAD(&pool->clean_list); | 227 | INIT_XLIST_HEAD(&pool->clean_list); |
| 206 | mutex_init(&pool->flush_lock); | 228 | mutex_init(&pool->flush_lock); |
| 207 | spin_lock_init(&pool->list_lock); | 229 | init_waitqueue_head(&pool->flush_wait); |
| 208 | INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); | 230 | INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); |
| 209 | 231 | ||
| 210 | pool->fmr_attr.max_pages = fmr_message_size; | 232 | pool->fmr_attr.max_pages = fmr_message_size; |
| 211 | pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; | 233 | pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; |
| @@ -233,34 +255,60 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co | |||
| 233 | 255 | ||
| 234 | void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) | 256 | void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) |
| 235 | { | 257 | { |
| 236 | flush_workqueue(rds_wq); | 258 | cancel_delayed_work_sync(&pool->flush_worker); |
| 237 | rds_ib_flush_mr_pool(pool, 1); | 259 | rds_ib_flush_mr_pool(pool, 1, NULL); |
| 238 | WARN_ON(atomic_read(&pool->item_count)); | 260 | WARN_ON(atomic_read(&pool->item_count)); |
| 239 | WARN_ON(atomic_read(&pool->free_pinned)); | 261 | WARN_ON(atomic_read(&pool->free_pinned)); |
| 240 | kfree(pool); | 262 | kfree(pool); |
| 241 | } | 263 | } |
| 242 | 264 | ||
| 265 | static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl, | ||
| 266 | struct rds_ib_mr **ibmr_ret) | ||
| 267 | { | ||
| 268 | struct xlist_head *ibmr_xl; | ||
| 269 | ibmr_xl = xlist_del_head_fast(xl); | ||
| 270 | *ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist); | ||
| 271 | } | ||
| 272 | |||
| 243 | static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) | 273 | static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) |
| 244 | { | 274 | { |
| 245 | struct rds_ib_mr *ibmr = NULL; | 275 | struct rds_ib_mr *ibmr = NULL; |
| 246 | unsigned long flags; | 276 | struct xlist_head *ret; |
| 277 | unsigned long *flag; | ||
| 247 | 278 | ||
| 248 | spin_lock_irqsave(&pool->list_lock, flags); | 279 | preempt_disable(); |
| 249 | if (!list_empty(&pool->clean_list)) { | 280 | flag = &__get_cpu_var(clean_list_grace); |
| 250 | ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list); | 281 | set_bit(CLEAN_LIST_BUSY_BIT, flag); |
| 251 | list_del_init(&ibmr->list); | 282 | ret = xlist_del_head(&pool->clean_list); |
| 252 | } | 283 | if (ret) |
| 253 | spin_unlock_irqrestore(&pool->list_lock, flags); | 284 | ibmr = list_entry(ret, struct rds_ib_mr, xlist); |
| 254 | 285 | ||
| 286 | clear_bit(CLEAN_LIST_BUSY_BIT, flag); | ||
| 287 | preempt_enable(); | ||
| 255 | return ibmr; | 288 | return ibmr; |
| 256 | } | 289 | } |
| 257 | 290 | ||
| 291 | static inline void wait_clean_list_grace(void) | ||
| 292 | { | ||
| 293 | int cpu; | ||
| 294 | unsigned long *flag; | ||
| 295 | |||
| 296 | for_each_online_cpu(cpu) { | ||
| 297 | flag = &per_cpu(clean_list_grace, cpu); | ||
| 298 | while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) | ||
| 299 | cpu_relax(); | ||
| 300 | } | ||
| 301 | } | ||
| 302 | |||
| 258 | static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) | 303 | static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) |
| 259 | { | 304 | { |
| 260 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; | 305 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; |
| 261 | struct rds_ib_mr *ibmr = NULL; | 306 | struct rds_ib_mr *ibmr = NULL; |
| 262 | int err = 0, iter = 0; | 307 | int err = 0, iter = 0; |
| 263 | 308 | ||
| 309 | if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) | ||
| 310 | queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); | ||
| 311 | |||
| 264 | while (1) { | 312 | while (1) { |
| 265 | ibmr = rds_ib_reuse_fmr(pool); | 313 | ibmr = rds_ib_reuse_fmr(pool); |
| 266 | if (ibmr) | 314 | if (ibmr) |
| @@ -287,19 +335,24 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) | |||
| 287 | 335 | ||
| 288 | /* We do have some empty MRs. Flush them out. */ | 336 | /* We do have some empty MRs. Flush them out. */ |
| 289 | rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); | 337 | rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); |
| 290 | rds_ib_flush_mr_pool(pool, 0); | 338 | rds_ib_flush_mr_pool(pool, 0, &ibmr); |
| 339 | if (ibmr) | ||
| 340 | return ibmr; | ||
| 291 | } | 341 | } |
| 292 | 342 | ||
| 293 | ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); | 343 | ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev)); |
| 294 | if (!ibmr) { | 344 | if (!ibmr) { |
| 295 | err = -ENOMEM; | 345 | err = -ENOMEM; |
| 296 | goto out_no_cigar; | 346 | goto out_no_cigar; |
| 297 | } | 347 | } |
| 298 | 348 | ||
| 349 | memset(ibmr, 0, sizeof(*ibmr)); | ||
| 350 | |||
| 299 | ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, | 351 | ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, |
| 300 | (IB_ACCESS_LOCAL_WRITE | | 352 | (IB_ACCESS_LOCAL_WRITE | |
| 301 | IB_ACCESS_REMOTE_READ | | 353 | IB_ACCESS_REMOTE_READ | |
| 302 | IB_ACCESS_REMOTE_WRITE), | 354 | IB_ACCESS_REMOTE_WRITE| |
| 355 | IB_ACCESS_REMOTE_ATOMIC), | ||
| 303 | &pool->fmr_attr); | 356 | &pool->fmr_attr); |
| 304 | if (IS_ERR(ibmr->fmr)) { | 357 | if (IS_ERR(ibmr->fmr)) { |
| 305 | err = PTR_ERR(ibmr->fmr); | 358 | err = PTR_ERR(ibmr->fmr); |
| @@ -367,7 +420,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm | |||
| 367 | if (page_cnt > fmr_message_size) | 420 | if (page_cnt > fmr_message_size) |
| 368 | return -EINVAL; | 421 | return -EINVAL; |
| 369 | 422 | ||
| 370 | dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC); | 423 | dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, |
| 424 | rdsibdev_to_node(rds_ibdev)); | ||
| 371 | if (!dma_pages) | 425 | if (!dma_pages) |
| 372 | return -ENOMEM; | 426 | return -ENOMEM; |
| 373 | 427 | ||
| @@ -441,7 +495,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) | |||
| 441 | 495 | ||
| 442 | /* FIXME we need a way to tell a r/w MR | 496 | /* FIXME we need a way to tell a r/w MR |
| 443 | * from a r/o MR */ | 497 | * from a r/o MR */ |
| 444 | BUG_ON(in_interrupt()); | 498 | BUG_ON(irqs_disabled()); |
| 445 | set_page_dirty(page); | 499 | set_page_dirty(page); |
| 446 | put_page(page); | 500 | put_page(page); |
| 447 | } | 501 | } |
| @@ -477,33 +531,109 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr | |||
| 477 | } | 531 | } |
| 478 | 532 | ||
| 479 | /* | 533 | /* |
| 534 | * given an xlist of mrs, put them all into the list_head for more processing | ||
| 535 | */ | ||
| 536 | static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list) | ||
| 537 | { | ||
| 538 | struct rds_ib_mr *ibmr; | ||
| 539 | struct xlist_head splice; | ||
| 540 | struct xlist_head *cur; | ||
| 541 | struct xlist_head *next; | ||
| 542 | |||
| 543 | splice.next = NULL; | ||
| 544 | xlist_splice(xlist, &splice); | ||
| 545 | cur = splice.next; | ||
| 546 | while (cur) { | ||
| 547 | next = cur->next; | ||
| 548 | ibmr = list_entry(cur, struct rds_ib_mr, xlist); | ||
| 549 | list_add_tail(&ibmr->unmap_list, list); | ||
| 550 | cur = next; | ||
| 551 | } | ||
| 552 | } | ||
| 553 | |||
| 554 | /* | ||
| 555 | * this takes a list head of mrs and turns it into an xlist of clusters. | ||
| 556 | * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for | ||
| 557 | * reuse. | ||
| 558 | */ | ||
| 559 | static void list_append_to_xlist(struct rds_ib_mr_pool *pool, | ||
| 560 | struct list_head *list, struct xlist_head *xlist, | ||
| 561 | struct xlist_head **tail_ret) | ||
| 562 | { | ||
| 563 | struct rds_ib_mr *ibmr; | ||
| 564 | struct xlist_head *cur_mr = xlist; | ||
| 565 | struct xlist_head *tail_mr = NULL; | ||
| 566 | |||
| 567 | list_for_each_entry(ibmr, list, unmap_list) { | ||
| 568 | tail_mr = &ibmr->xlist; | ||
| 569 | tail_mr->next = NULL; | ||
| 570 | cur_mr->next = tail_mr; | ||
| 571 | cur_mr = tail_mr; | ||
| 572 | } | ||
| 573 | *tail_ret = tail_mr; | ||
| 574 | } | ||
| 575 | |||
| 576 | /* | ||
| 480 | * Flush our pool of MRs. | 577 | * Flush our pool of MRs. |
| 481 | * At a minimum, all currently unused MRs are unmapped. | 578 | * At a minimum, all currently unused MRs are unmapped. |
| 482 | * If the number of MRs allocated exceeds the limit, we also try | 579 | * If the number of MRs allocated exceeds the limit, we also try |
| 483 | * to free as many MRs as needed to get back to this limit. | 580 | * to free as many MRs as needed to get back to this limit. |
| 484 | */ | 581 | */ |
| 485 | static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) | 582 | static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, |
| 583 | int free_all, struct rds_ib_mr **ibmr_ret) | ||
| 486 | { | 584 | { |
| 487 | struct rds_ib_mr *ibmr, *next; | 585 | struct rds_ib_mr *ibmr, *next; |
| 586 | struct xlist_head clean_xlist; | ||
| 587 | struct xlist_head *clean_tail; | ||
| 488 | LIST_HEAD(unmap_list); | 588 | LIST_HEAD(unmap_list); |
| 489 | LIST_HEAD(fmr_list); | 589 | LIST_HEAD(fmr_list); |
| 490 | unsigned long unpinned = 0; | 590 | unsigned long unpinned = 0; |
| 491 | unsigned long flags; | ||
| 492 | unsigned int nfreed = 0, ncleaned = 0, free_goal; | 591 | unsigned int nfreed = 0, ncleaned = 0, free_goal; |
| 493 | int ret = 0; | 592 | int ret = 0; |
| 494 | 593 | ||
| 495 | rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); | 594 | rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); |
| 496 | 595 | ||
| 497 | mutex_lock(&pool->flush_lock); | 596 | if (ibmr_ret) { |
| 597 | DEFINE_WAIT(wait); | ||
| 598 | while(!mutex_trylock(&pool->flush_lock)) { | ||
| 599 | ibmr = rds_ib_reuse_fmr(pool); | ||
| 600 | if (ibmr) { | ||
| 601 | *ibmr_ret = ibmr; | ||
| 602 | finish_wait(&pool->flush_wait, &wait); | ||
| 603 | goto out_nolock; | ||
| 604 | } | ||
| 605 | |||
| 606 | prepare_to_wait(&pool->flush_wait, &wait, | ||
| 607 | TASK_UNINTERRUPTIBLE); | ||
| 608 | if (xlist_empty(&pool->clean_list)) | ||
| 609 | schedule(); | ||
| 610 | |||
| 611 | ibmr = rds_ib_reuse_fmr(pool); | ||
| 612 | if (ibmr) { | ||
| 613 | *ibmr_ret = ibmr; | ||
| 614 | finish_wait(&pool->flush_wait, &wait); | ||
| 615 | goto out_nolock; | ||
| 616 | } | ||
| 617 | } | ||
| 618 | finish_wait(&pool->flush_wait, &wait); | ||
| 619 | } else | ||
| 620 | mutex_lock(&pool->flush_lock); | ||
| 621 | |||
| 622 | if (ibmr_ret) { | ||
| 623 | ibmr = rds_ib_reuse_fmr(pool); | ||
| 624 | if (ibmr) { | ||
| 625 | *ibmr_ret = ibmr; | ||
| 626 | goto out; | ||
| 627 | } | ||
| 628 | } | ||
| 498 | 629 | ||
| 499 | spin_lock_irqsave(&pool->list_lock, flags); | ||
| 500 | /* Get the list of all MRs to be dropped. Ordering matters - | 630 | /* Get the list of all MRs to be dropped. Ordering matters - |
| 501 | * we want to put drop_list ahead of free_list. */ | 631 | * we want to put drop_list ahead of free_list. |
| 502 | list_splice_init(&pool->free_list, &unmap_list); | 632 | */ |
| 503 | list_splice_init(&pool->drop_list, &unmap_list); | 633 | xlist_append_to_list(&pool->drop_list, &unmap_list); |
| 634 | xlist_append_to_list(&pool->free_list, &unmap_list); | ||
| 504 | if (free_all) | 635 | if (free_all) |
| 505 | list_splice_init(&pool->clean_list, &unmap_list); | 636 | xlist_append_to_list(&pool->clean_list, &unmap_list); |
| 506 | spin_unlock_irqrestore(&pool->list_lock, flags); | ||
| 507 | 637 | ||
| 508 | free_goal = rds_ib_flush_goal(pool, free_all); | 638 | free_goal = rds_ib_flush_goal(pool, free_all); |
| 509 | 639 | ||
| @@ -511,19 +641,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) | |||
| 511 | goto out; | 641 | goto out; |
| 512 | 642 | ||
| 513 | /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ | 643 | /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ |
| 514 | list_for_each_entry(ibmr, &unmap_list, list) | 644 | list_for_each_entry(ibmr, &unmap_list, unmap_list) |
| 515 | list_add(&ibmr->fmr->list, &fmr_list); | 645 | list_add(&ibmr->fmr->list, &fmr_list); |
| 646 | |||
| 516 | ret = ib_unmap_fmr(&fmr_list); | 647 | ret = ib_unmap_fmr(&fmr_list); |
| 517 | if (ret) | 648 | if (ret) |
| 518 | printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); | 649 | printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); |
| 519 | 650 | ||
| 520 | /* Now we can destroy the DMA mapping and unpin any pages */ | 651 | /* Now we can destroy the DMA mapping and unpin any pages */ |
| 521 | list_for_each_entry_safe(ibmr, next, &unmap_list, list) { | 652 | list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) { |
| 522 | unpinned += ibmr->sg_len; | 653 | unpinned += ibmr->sg_len; |
| 523 | __rds_ib_teardown_mr(ibmr); | 654 | __rds_ib_teardown_mr(ibmr); |
| 524 | if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { | 655 | if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { |
| 525 | rds_ib_stats_inc(s_ib_rdma_mr_free); | 656 | rds_ib_stats_inc(s_ib_rdma_mr_free); |
| 526 | list_del(&ibmr->list); | 657 | list_del(&ibmr->unmap_list); |
| 527 | ib_dealloc_fmr(ibmr->fmr); | 658 | ib_dealloc_fmr(ibmr->fmr); |
| 528 | kfree(ibmr); | 659 | kfree(ibmr); |
| 529 | nfreed++; | 660 | nfreed++; |
| @@ -531,9 +662,27 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) | |||
| 531 | ncleaned++; | 662 | ncleaned++; |
| 532 | } | 663 | } |
| 533 | 664 | ||
| 534 | spin_lock_irqsave(&pool->list_lock, flags); | 665 | if (!list_empty(&unmap_list)) { |
| 535 | list_splice(&unmap_list, &pool->clean_list); | 666 | /* we have to make sure that none of the things we're about |
| 536 | spin_unlock_irqrestore(&pool->list_lock, flags); | 667 | * to put on the clean list would race with other cpus trying |
| 668 | * to pull items off. The xlist would explode if we managed to | ||
| 669 | * remove something from the clean list and then add it back again | ||
| 670 | * while another CPU was spinning on that same item in xlist_del_head. | ||
| 671 | * | ||
| 672 | * This is pretty unlikely, but just in case wait for an xlist grace period | ||
| 673 | * here before adding anything back into the clean list. | ||
| 674 | */ | ||
| 675 | wait_clean_list_grace(); | ||
| 676 | |||
| 677 | list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail); | ||
| 678 | if (ibmr_ret) | ||
| 679 | refill_local(pool, &clean_xlist, ibmr_ret); | ||
| 680 | |||
| 681 | /* refill_local may have emptied our list */ | ||
| 682 | if (!xlist_empty(&clean_xlist)) | ||
| 683 | xlist_add(clean_xlist.next, clean_tail, &pool->clean_list); | ||
| 684 | |||
| 685 | } | ||
| 537 | 686 | ||
| 538 | atomic_sub(unpinned, &pool->free_pinned); | 687 | atomic_sub(unpinned, &pool->free_pinned); |
| 539 | atomic_sub(ncleaned, &pool->dirty_count); | 688 | atomic_sub(ncleaned, &pool->dirty_count); |
| @@ -541,14 +690,35 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) | |||
| 541 | 690 | ||
| 542 | out: | 691 | out: |
| 543 | mutex_unlock(&pool->flush_lock); | 692 | mutex_unlock(&pool->flush_lock); |
| 693 | if (waitqueue_active(&pool->flush_wait)) | ||
| 694 | wake_up(&pool->flush_wait); | ||
| 695 | out_nolock: | ||
| 544 | return ret; | 696 | return ret; |
| 545 | } | 697 | } |
| 546 | 698 | ||
| 699 | int rds_ib_fmr_init(void) | ||
| 700 | { | ||
| 701 | rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd"); | ||
| 702 | if (!rds_ib_fmr_wq) | ||
| 703 | return -ENOMEM; | ||
| 704 | return 0; | ||
| 705 | } | ||
| 706 | |||
| 707 | /* | ||
| 708 | * By the time this is called all the IB devices should have been torn down and | ||
| 709 | * had their pools freed. As each pool is freed its work struct is waited on, | ||
| 710 | * so the pool flushing work queue should be idle by the time we get here. | ||
| 711 | */ | ||
| 712 | void rds_ib_fmr_exit(void) | ||
| 713 | { | ||
| 714 | destroy_workqueue(rds_ib_fmr_wq); | ||
| 715 | } | ||
| 716 | |||
| 547 | static void rds_ib_mr_pool_flush_worker(struct work_struct *work) | 717 | static void rds_ib_mr_pool_flush_worker(struct work_struct *work) |
| 548 | { | 718 | { |
| 549 | struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker); | 719 | struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); |
| 550 | 720 | ||
| 551 | rds_ib_flush_mr_pool(pool, 0); | 721 | rds_ib_flush_mr_pool(pool, 0, NULL); |
| 552 | } | 722 | } |
| 553 | 723 | ||
| 554 | void rds_ib_free_mr(void *trans_private, int invalidate) | 724 | void rds_ib_free_mr(void *trans_private, int invalidate) |
| @@ -556,47 +726,49 @@ void rds_ib_free_mr(void *trans_private, int invalidate) | |||
| 556 | struct rds_ib_mr *ibmr = trans_private; | 726 | struct rds_ib_mr *ibmr = trans_private; |
| 557 | struct rds_ib_device *rds_ibdev = ibmr->device; | 727 | struct rds_ib_device *rds_ibdev = ibmr->device; |
| 558 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; | 728 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; |
| 559 | unsigned long flags; | ||
| 560 | 729 | ||
| 561 | rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); | 730 | rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); |
| 562 | 731 | ||
| 563 | /* Return it to the pool's free list */ | 732 | /* Return it to the pool's free list */ |
| 564 | spin_lock_irqsave(&pool->list_lock, flags); | ||
| 565 | if (ibmr->remap_count >= pool->fmr_attr.max_maps) | 733 | if (ibmr->remap_count >= pool->fmr_attr.max_maps) |
| 566 | list_add(&ibmr->list, &pool->drop_list); | 734 | xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list); |
| 567 | else | 735 | else |
| 568 | list_add(&ibmr->list, &pool->free_list); | 736 | xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list); |
| 569 | 737 | ||
| 570 | atomic_add(ibmr->sg_len, &pool->free_pinned); | 738 | atomic_add(ibmr->sg_len, &pool->free_pinned); |
| 571 | atomic_inc(&pool->dirty_count); | 739 | atomic_inc(&pool->dirty_count); |
| 572 | spin_unlock_irqrestore(&pool->list_lock, flags); | ||
| 573 | 740 | ||
| 574 | /* If we've pinned too many pages, request a flush */ | 741 | /* If we've pinned too many pages, request a flush */ |
| 575 | if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || | 742 | if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || |
| 576 | atomic_read(&pool->dirty_count) >= pool->max_items / 10) | 743 | atomic_read(&pool->dirty_count) >= pool->max_items / 10) |
| 577 | queue_work(rds_wq, &pool->flush_worker); | 744 | queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); |
| 578 | 745 | ||
| 579 | if (invalidate) { | 746 | if (invalidate) { |
| 580 | if (likely(!in_interrupt())) { | 747 | if (likely(!in_interrupt())) { |
| 581 | rds_ib_flush_mr_pool(pool, 0); | 748 | rds_ib_flush_mr_pool(pool, 0, NULL); |
| 582 | } else { | 749 | } else { |
| 583 | /* We get here if the user created a MR marked | 750 | /* We get here if the user created a MR marked |
| 584 | * as use_once and invalidate at the same time. */ | 751 | * as use_once and invalidate at the same time. */ |
| 585 | queue_work(rds_wq, &pool->flush_worker); | 752 | queue_delayed_work(rds_ib_fmr_wq, |
| 753 | &pool->flush_worker, 10); | ||
| 586 | } | 754 | } |
| 587 | } | 755 | } |
| 756 | |||
| 757 | rds_ib_dev_put(rds_ibdev); | ||
| 588 | } | 758 | } |
| 589 | 759 | ||
| 590 | void rds_ib_flush_mrs(void) | 760 | void rds_ib_flush_mrs(void) |
| 591 | { | 761 | { |
| 592 | struct rds_ib_device *rds_ibdev; | 762 | struct rds_ib_device *rds_ibdev; |
| 593 | 763 | ||
| 764 | down_read(&rds_ib_devices_lock); | ||
| 594 | list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { | 765 | list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { |
| 595 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; | 766 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; |
| 596 | 767 | ||
| 597 | if (pool) | 768 | if (pool) |
| 598 | rds_ib_flush_mr_pool(pool, 0); | 769 | rds_ib_flush_mr_pool(pool, 0, NULL); |
| 599 | } | 770 | } |
| 771 | up_read(&rds_ib_devices_lock); | ||
| 600 | } | 772 | } |
| 601 | 773 | ||
| 602 | void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | 774 | void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, |
| @@ -628,6 +800,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | |||
| 628 | printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); | 800 | printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); |
| 629 | 801 | ||
| 630 | ibmr->device = rds_ibdev; | 802 | ibmr->device = rds_ibdev; |
| 803 | rds_ibdev = NULL; | ||
| 631 | 804 | ||
| 632 | out: | 805 | out: |
| 633 | if (ret) { | 806 | if (ret) { |
| @@ -635,5 +808,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | |||
| 635 | rds_ib_free_mr(ibmr, 0); | 808 | rds_ib_free_mr(ibmr, 0); |
| 636 | ibmr = ERR_PTR(ret); | 809 | ibmr = ERR_PTR(ret); |
| 637 | } | 810 | } |
| 811 | if (rds_ibdev) | ||
| 812 | rds_ib_dev_put(rds_ibdev); | ||
| 638 | return ibmr; | 813 | return ibmr; |
| 639 | } | 814 | } |
| 815 | |||
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index c74e9904a6b2..e29e0ca32f74 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c | |||
| @@ -43,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab; | |||
| 43 | static struct kmem_cache *rds_ib_frag_slab; | 43 | static struct kmem_cache *rds_ib_frag_slab; |
| 44 | static atomic_t rds_ib_allocation = ATOMIC_INIT(0); | 44 | static atomic_t rds_ib_allocation = ATOMIC_INIT(0); |
| 45 | 45 | ||
| 46 | static void rds_ib_frag_drop_page(struct rds_page_frag *frag) | ||
| 47 | { | ||
| 48 | rdsdebug("frag %p page %p\n", frag, frag->f_page); | ||
| 49 | __free_page(frag->f_page); | ||
| 50 | frag->f_page = NULL; | ||
| 51 | } | ||
| 52 | |||
| 53 | static void rds_ib_frag_free(struct rds_page_frag *frag) | ||
| 54 | { | ||
| 55 | rdsdebug("frag %p page %p\n", frag, frag->f_page); | ||
| 56 | BUG_ON(frag->f_page != NULL); | ||
| 57 | kmem_cache_free(rds_ib_frag_slab, frag); | ||
| 58 | } | ||
| 59 | |||
| 60 | /* | ||
| 61 | * We map a page at a time. Its fragments are posted in order. This | ||
| 62 | * is called in fragment order as the fragments get send completion events. | ||
| 63 | * Only the last frag in the page performs the unmapping. | ||
| 64 | * | ||
| 65 | * It's OK for ring cleanup to call this in whatever order it likes because | ||
| 66 | * DMA is not in flight and so we can unmap while other ring entries still | ||
| 67 | * hold page references in their frags. | ||
| 68 | */ | ||
| 69 | static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic, | ||
| 70 | struct rds_ib_recv_work *recv) | ||
| 71 | { | ||
| 72 | struct rds_page_frag *frag = recv->r_frag; | ||
| 73 | |||
| 74 | rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); | ||
| 75 | if (frag->f_mapped) | ||
| 76 | ib_dma_unmap_page(ic->i_cm_id->device, | ||
| 77 | frag->f_mapped, | ||
| 78 | RDS_FRAG_SIZE, DMA_FROM_DEVICE); | ||
| 79 | frag->f_mapped = 0; | ||
| 80 | } | ||
| 81 | |||
| 82 | void rds_ib_recv_init_ring(struct rds_ib_connection *ic) | 46 | void rds_ib_recv_init_ring(struct rds_ib_connection *ic) |
| 83 | { | 47 | { |
| 84 | struct rds_ib_recv_work *recv; | 48 | struct rds_ib_recv_work *recv; |
| @@ -95,16 +59,161 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic) | |||
| 95 | recv->r_wr.sg_list = recv->r_sge; | 59 | recv->r_wr.sg_list = recv->r_sge; |
| 96 | recv->r_wr.num_sge = RDS_IB_RECV_SGE; | 60 | recv->r_wr.num_sge = RDS_IB_RECV_SGE; |
| 97 | 61 | ||
| 98 | sge = rds_ib_data_sge(ic, recv->r_sge); | 62 | sge = &recv->r_sge[0]; |
| 63 | sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); | ||
| 64 | sge->length = sizeof(struct rds_header); | ||
| 65 | sge->lkey = ic->i_mr->lkey; | ||
| 66 | |||
| 67 | sge = &recv->r_sge[1]; | ||
| 99 | sge->addr = 0; | 68 | sge->addr = 0; |
| 100 | sge->length = RDS_FRAG_SIZE; | 69 | sge->length = RDS_FRAG_SIZE; |
| 101 | sge->lkey = ic->i_mr->lkey; | 70 | sge->lkey = ic->i_mr->lkey; |
| 71 | } | ||
| 72 | } | ||
| 102 | 73 | ||
| 103 | sge = rds_ib_header_sge(ic, recv->r_sge); | 74 | /* |
| 104 | sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); | 75 | * The entire 'from' list, including the from element itself, is put on |
| 105 | sge->length = sizeof(struct rds_header); | 76 | * to the tail of the 'to' list. |
| 106 | sge->lkey = ic->i_mr->lkey; | 77 | */ |
| 78 | static void list_splice_entire_tail(struct list_head *from, | ||
| 79 | struct list_head *to) | ||
| 80 | { | ||
| 81 | struct list_head *from_last = from->prev; | ||
| 82 | |||
| 83 | list_splice_tail(from_last, to); | ||
| 84 | list_add_tail(from_last, to); | ||
| 85 | } | ||
| 86 | |||
| 87 | static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache) | ||
| 88 | { | ||
| 89 | struct list_head *tmp; | ||
| 90 | |||
| 91 | tmp = xchg(&cache->xfer, NULL); | ||
| 92 | if (tmp) { | ||
| 93 | if (cache->ready) | ||
| 94 | list_splice_entire_tail(tmp, cache->ready); | ||
| 95 | else | ||
| 96 | cache->ready = tmp; | ||
| 97 | } | ||
| 98 | } | ||
| 99 | |||
| 100 | static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache) | ||
| 101 | { | ||
| 102 | struct rds_ib_cache_head *head; | ||
| 103 | int cpu; | ||
| 104 | |||
| 105 | cache->percpu = alloc_percpu(struct rds_ib_cache_head); | ||
| 106 | if (!cache->percpu) | ||
| 107 | return -ENOMEM; | ||
| 108 | |||
| 109 | for_each_possible_cpu(cpu) { | ||
| 110 | head = per_cpu_ptr(cache->percpu, cpu); | ||
| 111 | head->first = NULL; | ||
| 112 | head->count = 0; | ||
| 113 | } | ||
| 114 | cache->xfer = NULL; | ||
| 115 | cache->ready = NULL; | ||
| 116 | |||
| 117 | return 0; | ||
| 118 | } | ||
| 119 | |||
| 120 | int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic) | ||
| 121 | { | ||
| 122 | int ret; | ||
| 123 | |||
| 124 | ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs); | ||
| 125 | if (!ret) { | ||
| 126 | ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags); | ||
| 127 | if (ret) | ||
| 128 | free_percpu(ic->i_cache_incs.percpu); | ||
| 107 | } | 129 | } |
| 130 | |||
| 131 | return ret; | ||
| 132 | } | ||
| 133 | |||
| 134 | static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache, | ||
| 135 | struct list_head *caller_list) | ||
| 136 | { | ||
| 137 | struct rds_ib_cache_head *head; | ||
| 138 | int cpu; | ||
| 139 | |||
| 140 | for_each_possible_cpu(cpu) { | ||
| 141 | head = per_cpu_ptr(cache->percpu, cpu); | ||
| 142 | if (head->first) { | ||
| 143 | list_splice_entire_tail(head->first, caller_list); | ||
| 144 | head->first = NULL; | ||
| 145 | } | ||
| 146 | } | ||
| 147 | |||
| 148 | if (cache->ready) { | ||
| 149 | list_splice_entire_tail(cache->ready, caller_list); | ||
| 150 | cache->ready = NULL; | ||
| 151 | } | ||
| 152 | } | ||
| 153 | |||
| 154 | void rds_ib_recv_free_caches(struct rds_ib_connection *ic) | ||
| 155 | { | ||
| 156 | struct rds_ib_incoming *inc; | ||
| 157 | struct rds_ib_incoming *inc_tmp; | ||
| 158 | struct rds_page_frag *frag; | ||
| 159 | struct rds_page_frag *frag_tmp; | ||
| 160 | LIST_HEAD(list); | ||
| 161 | |||
| 162 | rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); | ||
| 163 | rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list); | ||
| 164 | free_percpu(ic->i_cache_incs.percpu); | ||
| 165 | |||
| 166 | list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) { | ||
| 167 | list_del(&inc->ii_cache_entry); | ||
| 168 | WARN_ON(!list_empty(&inc->ii_frags)); | ||
| 169 | kmem_cache_free(rds_ib_incoming_slab, inc); | ||
| 170 | } | ||
| 171 | |||
| 172 | rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); | ||
| 173 | rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list); | ||
| 174 | free_percpu(ic->i_cache_frags.percpu); | ||
| 175 | |||
| 176 | list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) { | ||
| 177 | list_del(&frag->f_cache_entry); | ||
| 178 | WARN_ON(!list_empty(&frag->f_item)); | ||
| 179 | kmem_cache_free(rds_ib_frag_slab, frag); | ||
| 180 | } | ||
| 181 | } | ||
| 182 | |||
| 183 | /* fwd decl */ | ||
| 184 | static void rds_ib_recv_cache_put(struct list_head *new_item, | ||
| 185 | struct rds_ib_refill_cache *cache); | ||
| 186 | static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache); | ||
| 187 | |||
| 188 | |||
| 189 | /* Recycle frag and attached recv buffer f_sg */ | ||
| 190 | static void rds_ib_frag_free(struct rds_ib_connection *ic, | ||
| 191 | struct rds_page_frag *frag) | ||
| 192 | { | ||
| 193 | rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); | ||
| 194 | |||
| 195 | rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags); | ||
| 196 | } | ||
| 197 | |||
| 198 | /* Recycle inc after freeing attached frags */ | ||
| 199 | void rds_ib_inc_free(struct rds_incoming *inc) | ||
| 200 | { | ||
| 201 | struct rds_ib_incoming *ibinc; | ||
| 202 | struct rds_page_frag *frag; | ||
| 203 | struct rds_page_frag *pos; | ||
| 204 | struct rds_ib_connection *ic = inc->i_conn->c_transport_data; | ||
| 205 | |||
| 206 | ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); | ||
| 207 | |||
| 208 | /* Free attached frags */ | ||
| 209 | list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { | ||
| 210 | list_del_init(&frag->f_item); | ||
| 211 | rds_ib_frag_free(ic, frag); | ||
| 212 | } | ||
| 213 | BUG_ON(!list_empty(&ibinc->ii_frags)); | ||
| 214 | |||
| 215 | rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); | ||
| 216 | rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs); | ||
| 108 | } | 217 | } |
| 109 | 218 | ||
| 110 | static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, | 219 | static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, |
| @@ -115,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, | |||
| 115 | recv->r_ibinc = NULL; | 224 | recv->r_ibinc = NULL; |
| 116 | } | 225 | } |
| 117 | if (recv->r_frag) { | 226 | if (recv->r_frag) { |
| 118 | rds_ib_recv_unmap_page(ic, recv); | 227 | ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); |
| 119 | if (recv->r_frag->f_page) | 228 | rds_ib_frag_free(ic, recv->r_frag); |
| 120 | rds_ib_frag_drop_page(recv->r_frag); | ||
| 121 | rds_ib_frag_free(recv->r_frag); | ||
| 122 | recv->r_frag = NULL; | 229 | recv->r_frag = NULL; |
| 123 | } | 230 | } |
| 124 | } | 231 | } |
| @@ -129,84 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) | |||
| 129 | 236 | ||
| 130 | for (i = 0; i < ic->i_recv_ring.w_nr; i++) | 237 | for (i = 0; i < ic->i_recv_ring.w_nr; i++) |
| 131 | rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); | 238 | rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); |
| 132 | |||
| 133 | if (ic->i_frag.f_page) | ||
| 134 | rds_ib_frag_drop_page(&ic->i_frag); | ||
| 135 | } | 239 | } |
| 136 | 240 | ||
| 137 | static int rds_ib_recv_refill_one(struct rds_connection *conn, | 241 | static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic, |
| 138 | struct rds_ib_recv_work *recv, | 242 | gfp_t slab_mask) |
| 139 | gfp_t kptr_gfp, gfp_t page_gfp) | ||
| 140 | { | 243 | { |
| 141 | struct rds_ib_connection *ic = conn->c_transport_data; | 244 | struct rds_ib_incoming *ibinc; |
| 142 | dma_addr_t dma_addr; | 245 | struct list_head *cache_item; |
| 143 | struct ib_sge *sge; | 246 | int avail_allocs; |
| 144 | int ret = -ENOMEM; | ||
| 145 | 247 | ||
| 146 | if (recv->r_ibinc == NULL) { | 248 | cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs); |
| 147 | if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) { | 249 | if (cache_item) { |
| 250 | ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry); | ||
| 251 | } else { | ||
| 252 | avail_allocs = atomic_add_unless(&rds_ib_allocation, | ||
| 253 | 1, rds_ib_sysctl_max_recv_allocation); | ||
| 254 | if (!avail_allocs) { | ||
| 148 | rds_ib_stats_inc(s_ib_rx_alloc_limit); | 255 | rds_ib_stats_inc(s_ib_rx_alloc_limit); |
| 149 | goto out; | 256 | return NULL; |
| 150 | } | 257 | } |
| 151 | recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, | 258 | ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask); |
| 152 | kptr_gfp); | 259 | if (!ibinc) { |
| 153 | if (recv->r_ibinc == NULL) { | ||
| 154 | atomic_dec(&rds_ib_allocation); | 260 | atomic_dec(&rds_ib_allocation); |
| 155 | goto out; | 261 | return NULL; |
| 156 | } | 262 | } |
| 157 | INIT_LIST_HEAD(&recv->r_ibinc->ii_frags); | ||
| 158 | rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr); | ||
| 159 | } | 263 | } |
| 264 | INIT_LIST_HEAD(&ibinc->ii_frags); | ||
| 265 | rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); | ||
| 160 | 266 | ||
| 161 | if (recv->r_frag == NULL) { | 267 | return ibinc; |
| 162 | recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp); | 268 | } |
| 163 | if (recv->r_frag == NULL) | 269 | |
| 164 | goto out; | 270 | static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic, |
| 165 | INIT_LIST_HEAD(&recv->r_frag->f_item); | 271 | gfp_t slab_mask, gfp_t page_mask) |
| 166 | recv->r_frag->f_page = NULL; | 272 | { |
| 273 | struct rds_page_frag *frag; | ||
| 274 | struct list_head *cache_item; | ||
| 275 | int ret; | ||
| 276 | |||
| 277 | cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags); | ||
| 278 | if (cache_item) { | ||
| 279 | frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); | ||
| 280 | } else { | ||
| 281 | frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); | ||
| 282 | if (!frag) | ||
| 283 | return NULL; | ||
| 284 | |||
| 285 | sg_init_table(&frag->f_sg, 1); | ||
| 286 | ret = rds_page_remainder_alloc(&frag->f_sg, | ||
| 287 | RDS_FRAG_SIZE, page_mask); | ||
| 288 | if (ret) { | ||
| 289 | kmem_cache_free(rds_ib_frag_slab, frag); | ||
| 290 | return NULL; | ||
| 291 | } | ||
| 167 | } | 292 | } |
| 168 | 293 | ||
| 169 | if (ic->i_frag.f_page == NULL) { | 294 | INIT_LIST_HEAD(&frag->f_item); |
| 170 | ic->i_frag.f_page = alloc_page(page_gfp); | 295 | |
| 171 | if (ic->i_frag.f_page == NULL) | 296 | return frag; |
| 172 | goto out; | 297 | } |
| 173 | ic->i_frag.f_offset = 0; | 298 | |
| 299 | static int rds_ib_recv_refill_one(struct rds_connection *conn, | ||
| 300 | struct rds_ib_recv_work *recv, int prefill) | ||
| 301 | { | ||
| 302 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
| 303 | struct ib_sge *sge; | ||
| 304 | int ret = -ENOMEM; | ||
| 305 | gfp_t slab_mask = GFP_NOWAIT; | ||
| 306 | gfp_t page_mask = GFP_NOWAIT; | ||
| 307 | |||
| 308 | if (prefill) { | ||
| 309 | slab_mask = GFP_KERNEL; | ||
| 310 | page_mask = GFP_HIGHUSER; | ||
| 174 | } | 311 | } |
| 175 | 312 | ||
| 176 | dma_addr = ib_dma_map_page(ic->i_cm_id->device, | 313 | if (!ic->i_cache_incs.ready) |
| 177 | ic->i_frag.f_page, | 314 | rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); |
| 178 | ic->i_frag.f_offset, | 315 | if (!ic->i_cache_frags.ready) |
| 179 | RDS_FRAG_SIZE, | 316 | rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); |
| 180 | DMA_FROM_DEVICE); | ||
| 181 | if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) | ||
| 182 | goto out; | ||
| 183 | 317 | ||
| 184 | /* | 318 | /* |
| 185 | * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap() | 319 | * ibinc was taken from recv if recv contained the start of a message. |
| 186 | * must be called on this recv. This happens as completions hit | 320 | * recvs that were continuations will still have this allocated. |
| 187 | * in order or on connection shutdown. | ||
| 188 | */ | 321 | */ |
| 189 | recv->r_frag->f_page = ic->i_frag.f_page; | 322 | if (!recv->r_ibinc) { |
| 190 | recv->r_frag->f_offset = ic->i_frag.f_offset; | 323 | recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask); |
| 191 | recv->r_frag->f_mapped = dma_addr; | 324 | if (!recv->r_ibinc) |
| 325 | goto out; | ||
| 326 | } | ||
| 192 | 327 | ||
| 193 | sge = rds_ib_data_sge(ic, recv->r_sge); | 328 | WARN_ON(recv->r_frag); /* leak! */ |
| 194 | sge->addr = dma_addr; | 329 | recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask); |
| 195 | sge->length = RDS_FRAG_SIZE; | 330 | if (!recv->r_frag) |
| 331 | goto out; | ||
| 332 | |||
| 333 | ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, | ||
| 334 | 1, DMA_FROM_DEVICE); | ||
| 335 | WARN_ON(ret != 1); | ||
| 196 | 336 | ||
| 197 | sge = rds_ib_header_sge(ic, recv->r_sge); | 337 | sge = &recv->r_sge[0]; |
| 198 | sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); | 338 | sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); |
| 199 | sge->length = sizeof(struct rds_header); | 339 | sge->length = sizeof(struct rds_header); |
| 200 | 340 | ||
| 201 | get_page(recv->r_frag->f_page); | 341 | sge = &recv->r_sge[1]; |
| 202 | 342 | sge->addr = sg_dma_address(&recv->r_frag->f_sg); | |
| 203 | if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { | 343 | sge->length = sg_dma_len(&recv->r_frag->f_sg); |
| 204 | ic->i_frag.f_offset += RDS_FRAG_SIZE; | ||
| 205 | } else { | ||
| 206 | put_page(ic->i_frag.f_page); | ||
| 207 | ic->i_frag.f_page = NULL; | ||
| 208 | ic->i_frag.f_offset = 0; | ||
| 209 | } | ||
| 210 | 344 | ||
| 211 | ret = 0; | 345 | ret = 0; |
| 212 | out: | 346 | out: |
| @@ -216,13 +350,11 @@ out: | |||
| 216 | /* | 350 | /* |
| 217 | * This tries to allocate and post unused work requests after making sure that | 351 | * This tries to allocate and post unused work requests after making sure that |
| 218 | * they have all the allocations they need to queue received fragments into | 352 | * they have all the allocations they need to queue received fragments into |
| 219 | * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc | 353 | * sockets. |
| 220 | * pairs don't go unmatched. | ||
| 221 | * | 354 | * |
| 222 | * -1 is returned if posting fails due to temporary resource exhaustion. | 355 | * -1 is returned if posting fails due to temporary resource exhaustion. |
| 223 | */ | 356 | */ |
| 224 | int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | 357 | void rds_ib_recv_refill(struct rds_connection *conn, int prefill) |
| 225 | gfp_t page_gfp, int prefill) | ||
| 226 | { | 358 | { |
| 227 | struct rds_ib_connection *ic = conn->c_transport_data; | 359 | struct rds_ib_connection *ic = conn->c_transport_data; |
| 228 | struct rds_ib_recv_work *recv; | 360 | struct rds_ib_recv_work *recv; |
| @@ -236,28 +368,25 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | |||
| 236 | if (pos >= ic->i_recv_ring.w_nr) { | 368 | if (pos >= ic->i_recv_ring.w_nr) { |
| 237 | printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", | 369 | printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", |
| 238 | pos); | 370 | pos); |
| 239 | ret = -EINVAL; | ||
| 240 | break; | 371 | break; |
| 241 | } | 372 | } |
| 242 | 373 | ||
| 243 | recv = &ic->i_recvs[pos]; | 374 | recv = &ic->i_recvs[pos]; |
| 244 | ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp); | 375 | ret = rds_ib_recv_refill_one(conn, recv, prefill); |
| 245 | if (ret) { | 376 | if (ret) { |
| 246 | ret = -1; | ||
| 247 | break; | 377 | break; |
| 248 | } | 378 | } |
| 249 | 379 | ||
| 250 | /* XXX when can this fail? */ | 380 | /* XXX when can this fail? */ |
| 251 | ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); | 381 | ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); |
| 252 | rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, | 382 | rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, |
| 253 | recv->r_ibinc, recv->r_frag->f_page, | 383 | recv->r_ibinc, sg_page(&recv->r_frag->f_sg), |
| 254 | (long) recv->r_frag->f_mapped, ret); | 384 | (long) sg_dma_address(&recv->r_frag->f_sg), ret); |
| 255 | if (ret) { | 385 | if (ret) { |
| 256 | rds_ib_conn_error(conn, "recv post on " | 386 | rds_ib_conn_error(conn, "recv post on " |
| 257 | "%pI4 returned %d, disconnecting and " | 387 | "%pI4 returned %d, disconnecting and " |
| 258 | "reconnecting\n", &conn->c_faddr, | 388 | "reconnecting\n", &conn->c_faddr, |
| 259 | ret); | 389 | ret); |
| 260 | ret = -1; | ||
| 261 | break; | 390 | break; |
| 262 | } | 391 | } |
| 263 | 392 | ||
| @@ -270,37 +399,73 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | |||
| 270 | 399 | ||
| 271 | if (ret) | 400 | if (ret) |
| 272 | rds_ib_ring_unalloc(&ic->i_recv_ring, 1); | 401 | rds_ib_ring_unalloc(&ic->i_recv_ring, 1); |
| 273 | return ret; | ||
| 274 | } | 402 | } |
| 275 | 403 | ||
| 276 | void rds_ib_inc_purge(struct rds_incoming *inc) | 404 | /* |
| 405 | * We want to recycle several types of recv allocations, like incs and frags. | ||
| 406 | * To use this, the *_free() function passes in the ptr to a list_head within | ||
| 407 | * the recyclee, as well as the cache to put it on. | ||
| 408 | * | ||
| 409 | * First, we put the memory on a percpu list. When this reaches a certain size, | ||
| 410 | * we move it to an intermediate non-percpu list in a lockless manner, with some | ||
| 411 | * xchg/cmpxchg wizardry. | ||
| 412 | * | ||
| 413 | * N.B. Instead of a list_head as the anchor, we use a single pointer, which can | ||
| 414 | * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and | ||
| 415 | * list_empty() will return true when only one element is actually present. | ||
| 416 | */ | ||
| 417 | static void rds_ib_recv_cache_put(struct list_head *new_item, | ||
| 418 | struct rds_ib_refill_cache *cache) | ||
| 277 | { | 419 | { |
| 278 | struct rds_ib_incoming *ibinc; | 420 | unsigned long flags; |
| 279 | struct rds_page_frag *frag; | 421 | struct rds_ib_cache_head *chp; |
| 280 | struct rds_page_frag *pos; | 422 | struct list_head *old; |
| 281 | 423 | ||
| 282 | ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); | 424 | local_irq_save(flags); |
| 283 | rdsdebug("purging ibinc %p inc %p\n", ibinc, inc); | ||
| 284 | 425 | ||
| 285 | list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { | 426 | chp = per_cpu_ptr(cache->percpu, smp_processor_id()); |
| 286 | list_del_init(&frag->f_item); | 427 | if (!chp->first) |
| 287 | rds_ib_frag_drop_page(frag); | 428 | INIT_LIST_HEAD(new_item); |
| 288 | rds_ib_frag_free(frag); | 429 | else /* put on front */ |
| 289 | } | 430 | list_add_tail(new_item, chp->first); |
| 431 | chp->first = new_item; | ||
| 432 | chp->count++; | ||
| 433 | |||
| 434 | if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT) | ||
| 435 | goto end; | ||
| 436 | |||
| 437 | /* | ||
| 438 | * Return our per-cpu first list to the cache's xfer by atomically | ||
| 439 | * grabbing the current xfer list, appending it to our per-cpu list, | ||
| 440 | * and then atomically returning that entire list back to the | ||
| 441 | * cache's xfer list as long as it's still empty. | ||
| 442 | */ | ||
| 443 | do { | ||
| 444 | old = xchg(&cache->xfer, NULL); | ||
| 445 | if (old) | ||
| 446 | list_splice_entire_tail(old, chp->first); | ||
| 447 | old = cmpxchg(&cache->xfer, NULL, chp->first); | ||
| 448 | } while (old); | ||
| 449 | |||
| 450 | chp->first = NULL; | ||
| 451 | chp->count = 0; | ||
| 452 | end: | ||
| 453 | local_irq_restore(flags); | ||
| 290 | } | 454 | } |
| 291 | 455 | ||
| 292 | void rds_ib_inc_free(struct rds_incoming *inc) | 456 | static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache) |
| 293 | { | 457 | { |
| 294 | struct rds_ib_incoming *ibinc; | 458 | struct list_head *head = cache->ready; |
| 295 | 459 | ||
| 296 | ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); | 460 | if (head) { |
| 461 | if (!list_empty(head)) { | ||
| 462 | cache->ready = head->next; | ||
| 463 | list_del_init(head); | ||
| 464 | } else | ||
| 465 | cache->ready = NULL; | ||
| 466 | } | ||
| 297 | 467 | ||
| 298 | rds_ib_inc_purge(inc); | 468 | return head; |
| 299 | rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); | ||
| 300 | BUG_ON(!list_empty(&ibinc->ii_frags)); | ||
| 301 | kmem_cache_free(rds_ib_incoming_slab, ibinc); | ||
| 302 | atomic_dec(&rds_ib_allocation); | ||
| 303 | BUG_ON(atomic_read(&rds_ib_allocation) < 0); | ||
| 304 | } | 469 | } |
| 305 | 470 | ||
| 306 | int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, | 471 | int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, |
| @@ -336,13 +501,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, | |||
| 336 | to_copy = min_t(unsigned long, to_copy, len - copied); | 501 | to_copy = min_t(unsigned long, to_copy, len - copied); |
| 337 | 502 | ||
| 338 | rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " | 503 | rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " |
| 339 | "[%p, %lu] + %lu\n", | 504 | "[%p, %u] + %lu\n", |
| 340 | to_copy, iov->iov_base, iov->iov_len, iov_off, | 505 | to_copy, iov->iov_base, iov->iov_len, iov_off, |
| 341 | frag->f_page, frag->f_offset, frag_off); | 506 | sg_page(&frag->f_sg), frag->f_sg.offset, frag_off); |
| 342 | 507 | ||
| 343 | /* XXX needs + offset for multiple recvs per page */ | 508 | /* XXX needs + offset for multiple recvs per page */ |
| 344 | ret = rds_page_copy_to_user(frag->f_page, | 509 | ret = rds_page_copy_to_user(sg_page(&frag->f_sg), |
| 345 | frag->f_offset + frag_off, | 510 | frag->f_sg.offset + frag_off, |
| 346 | iov->iov_base + iov_off, | 511 | iov->iov_base + iov_off, |
| 347 | to_copy); | 512 | to_copy); |
| 348 | if (ret) { | 513 | if (ret) { |
| @@ -557,47 +722,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic) | |||
| 557 | return rds_ib_get_ack(ic); | 722 | return rds_ib_get_ack(ic); |
| 558 | } | 723 | } |
| 559 | 724 | ||
| 560 | static struct rds_header *rds_ib_get_header(struct rds_connection *conn, | ||
| 561 | struct rds_ib_recv_work *recv, | ||
| 562 | u32 data_len) | ||
| 563 | { | ||
| 564 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
| 565 | void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs]; | ||
| 566 | void *addr; | ||
| 567 | u32 misplaced_hdr_bytes; | ||
| 568 | |||
| 569 | /* | ||
| 570 | * Support header at the front (RDS 3.1+) as well as header-at-end. | ||
| 571 | * | ||
| 572 | * Cases: | ||
| 573 | * 1) header all in header buff (great!) | ||
| 574 | * 2) header all in data page (copy all to header buff) | ||
| 575 | * 3) header split across hdr buf + data page | ||
| 576 | * (move bit in hdr buff to end before copying other bit from data page) | ||
| 577 | */ | ||
| 578 | if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE) | ||
| 579 | return hdr_buff; | ||
| 580 | |||
| 581 | if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) { | ||
| 582 | addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0); | ||
| 583 | memcpy(hdr_buff, | ||
| 584 | addr + recv->r_frag->f_offset + data_len, | ||
| 585 | sizeof(struct rds_header)); | ||
| 586 | kunmap_atomic(addr, KM_SOFTIRQ0); | ||
| 587 | return hdr_buff; | ||
| 588 | } | ||
| 589 | |||
| 590 | misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len)); | ||
| 591 | |||
| 592 | memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes); | ||
| 593 | |||
| 594 | addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0); | ||
| 595 | memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len, | ||
| 596 | sizeof(struct rds_header) - misplaced_hdr_bytes); | ||
| 597 | kunmap_atomic(addr, KM_SOFTIRQ0); | ||
| 598 | return hdr_buff; | ||
| 599 | } | ||
| 600 | |||
| 601 | /* | 725 | /* |
| 602 | * It's kind of lame that we're copying from the posted receive pages into | 726 | * It's kind of lame that we're copying from the posted receive pages into |
| 603 | * long-lived bitmaps. We could have posted the bitmaps and rdma written into | 727 | * long-lived bitmaps. We could have posted the bitmaps and rdma written into |
| @@ -639,7 +763,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, | |||
| 639 | to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); | 763 | to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); |
| 640 | BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ | 764 | BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ |
| 641 | 765 | ||
| 642 | addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); | 766 | addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0); |
| 643 | 767 | ||
| 644 | src = addr + frag_off; | 768 | src = addr + frag_off; |
| 645 | dst = (void *)map->m_page_addrs[map_page] + map_off; | 769 | dst = (void *)map->m_page_addrs[map_page] + map_off; |
| @@ -710,7 +834,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, | |||
| 710 | } | 834 | } |
| 711 | data_len -= sizeof(struct rds_header); | 835 | data_len -= sizeof(struct rds_header); |
| 712 | 836 | ||
| 713 | ihdr = rds_ib_get_header(conn, recv, data_len); | 837 | ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; |
| 714 | 838 | ||
| 715 | /* Validate the checksum. */ | 839 | /* Validate the checksum. */ |
| 716 | if (!rds_message_verify_checksum(ihdr)) { | 840 | if (!rds_message_verify_checksum(ihdr)) { |
| @@ -742,12 +866,12 @@ static void rds_ib_process_recv(struct rds_connection *conn, | |||
| 742 | * the inc is freed. We don't go that route, so we have to drop the | 866 | * the inc is freed. We don't go that route, so we have to drop the |
| 743 | * page ref ourselves. We can't just leave the page on the recv | 867 | * page ref ourselves. We can't just leave the page on the recv |
| 744 | * because that confuses the dma mapping of pages and each recv's use | 868 | * because that confuses the dma mapping of pages and each recv's use |
| 745 | * of a partial page. We can leave the frag, though, it will be | 869 | * of a partial page. |
| 746 | * reused. | ||
| 747 | * | 870 | * |
| 748 | * FIXME: Fold this into the code path below. | 871 | * FIXME: Fold this into the code path below. |
| 749 | */ | 872 | */ |
| 750 | rds_ib_frag_drop_page(recv->r_frag); | 873 | rds_ib_frag_free(ic, recv->r_frag); |
| 874 | recv->r_frag = NULL; | ||
| 751 | return; | 875 | return; |
| 752 | } | 876 | } |
| 753 | 877 | ||
| @@ -757,7 +881,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, | |||
| 757 | * into the inc and save the inc so we can hang upcoming fragments | 881 | * into the inc and save the inc so we can hang upcoming fragments |
| 758 | * off its list. | 882 | * off its list. |
| 759 | */ | 883 | */ |
| 760 | if (ibinc == NULL) { | 884 | if (!ibinc) { |
| 761 | ibinc = recv->r_ibinc; | 885 | ibinc = recv->r_ibinc; |
| 762 | recv->r_ibinc = NULL; | 886 | recv->r_ibinc = NULL; |
| 763 | ic->i_ibinc = ibinc; | 887 | ic->i_ibinc = ibinc; |
| @@ -842,32 +966,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic, | |||
| 842 | struct rds_ib_recv_work *recv; | 966 | struct rds_ib_recv_work *recv; |
| 843 | 967 | ||
| 844 | while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { | 968 | while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { |
| 845 | rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", | 969 | rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", |
| 846 | (unsigned long long)wc.wr_id, wc.status, wc.byte_len, | 970 | (unsigned long long)wc.wr_id, wc.status, |
| 971 | rds_ib_wc_status_str(wc.status), wc.byte_len, | ||
| 847 | be32_to_cpu(wc.ex.imm_data)); | 972 | be32_to_cpu(wc.ex.imm_data)); |
| 848 | rds_ib_stats_inc(s_ib_rx_cq_event); | 973 | rds_ib_stats_inc(s_ib_rx_cq_event); |
| 849 | 974 | ||
| 850 | recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; | 975 | recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; |
| 851 | 976 | ||
| 852 | rds_ib_recv_unmap_page(ic, recv); | 977 | ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); |
| 853 | 978 | ||
| 854 | /* | 979 | /* |
| 855 | * Also process recvs in connecting state because it is possible | 980 | * Also process recvs in connecting state because it is possible |
| 856 | * to get a recv completion _before_ the rdmacm ESTABLISHED | 981 | * to get a recv completion _before_ the rdmacm ESTABLISHED |
| 857 | * event is processed. | 982 | * event is processed. |
| 858 | */ | 983 | */ |
| 859 | if (rds_conn_up(conn) || rds_conn_connecting(conn)) { | 984 | if (wc.status == IB_WC_SUCCESS) { |
| 985 | rds_ib_process_recv(conn, recv, wc.byte_len, state); | ||
| 986 | } else { | ||
| 860 | /* We expect errors as the qp is drained during shutdown */ | 987 | /* We expect errors as the qp is drained during shutdown */ |
| 861 | if (wc.status == IB_WC_SUCCESS) { | 988 | if (rds_conn_up(conn) || rds_conn_connecting(conn)) |
| 862 | rds_ib_process_recv(conn, recv, wc.byte_len, state); | 989 | rds_ib_conn_error(conn, "recv completion on %pI4 had " |
| 863 | } else { | 990 | "status %u (%s), disconnecting and " |
| 864 | rds_ib_conn_error(conn, "recv completion on " | 991 | "reconnecting\n", &conn->c_faddr, |
| 865 | "%pI4 had status %u, disconnecting and " | 992 | wc.status, |
| 866 | "reconnecting\n", &conn->c_faddr, | 993 | rds_ib_wc_status_str(wc.status)); |
| 867 | wc.status); | ||
| 868 | } | ||
| 869 | } | 994 | } |
| 870 | 995 | ||
| 996 | /* | ||
| 997 | * It's very important that we only free this ring entry if we've truly | ||
| 998 | * freed the resources allocated to the entry. The refilling path can | ||
| 999 | * leak if we don't. | ||
| 1000 | */ | ||
| 871 | rds_ib_ring_free(&ic->i_recv_ring, 1); | 1001 | rds_ib_ring_free(&ic->i_recv_ring, 1); |
| 872 | } | 1002 | } |
| 873 | } | 1003 | } |
| @@ -897,11 +1027,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data) | |||
| 897 | if (rds_ib_ring_empty(&ic->i_recv_ring)) | 1027 | if (rds_ib_ring_empty(&ic->i_recv_ring)) |
| 898 | rds_ib_stats_inc(s_ib_rx_ring_empty); | 1028 | rds_ib_stats_inc(s_ib_rx_ring_empty); |
| 899 | 1029 | ||
| 900 | /* | ||
| 901 | * If the ring is running low, then schedule the thread to refill. | ||
| 902 | */ | ||
| 903 | if (rds_ib_ring_low(&ic->i_recv_ring)) | 1030 | if (rds_ib_ring_low(&ic->i_recv_ring)) |
| 904 | queue_delayed_work(rds_wq, &conn->c_recv_w, 0); | 1031 | rds_ib_recv_refill(conn, 0); |
| 905 | } | 1032 | } |
| 906 | 1033 | ||
| 907 | int rds_ib_recv(struct rds_connection *conn) | 1034 | int rds_ib_recv(struct rds_connection *conn) |
| @@ -910,25 +1037,13 @@ int rds_ib_recv(struct rds_connection *conn) | |||
| 910 | int ret = 0; | 1037 | int ret = 0; |
| 911 | 1038 | ||
| 912 | rdsdebug("conn %p\n", conn); | 1039 | rdsdebug("conn %p\n", conn); |
| 913 | |||
| 914 | /* | ||
| 915 | * If we get a temporary posting failure in this context then | ||
| 916 | * we're really low and we want the caller to back off for a bit. | ||
| 917 | */ | ||
| 918 | mutex_lock(&ic->i_recv_mutex); | ||
| 919 | if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) | ||
| 920 | ret = -ENOMEM; | ||
| 921 | else | ||
| 922 | rds_ib_stats_inc(s_ib_rx_refill_from_thread); | ||
| 923 | mutex_unlock(&ic->i_recv_mutex); | ||
| 924 | |||
| 925 | if (rds_conn_up(conn)) | 1040 | if (rds_conn_up(conn)) |
| 926 | rds_ib_attempt_ack(ic); | 1041 | rds_ib_attempt_ack(ic); |
| 927 | 1042 | ||
| 928 | return ret; | 1043 | return ret; |
| 929 | } | 1044 | } |
| 930 | 1045 | ||
| 931 | int __init rds_ib_recv_init(void) | 1046 | int rds_ib_recv_init(void) |
| 932 | { | 1047 | { |
| 933 | struct sysinfo si; | 1048 | struct sysinfo si; |
| 934 | int ret = -ENOMEM; | 1049 | int ret = -ENOMEM; |
| @@ -939,14 +1054,14 @@ int __init rds_ib_recv_init(void) | |||
| 939 | 1054 | ||
| 940 | rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", | 1055 | rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", |
| 941 | sizeof(struct rds_ib_incoming), | 1056 | sizeof(struct rds_ib_incoming), |
| 942 | 0, 0, NULL); | 1057 | 0, SLAB_HWCACHE_ALIGN, NULL); |
| 943 | if (rds_ib_incoming_slab == NULL) | 1058 | if (!rds_ib_incoming_slab) |
| 944 | goto out; | 1059 | goto out; |
| 945 | 1060 | ||
| 946 | rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", | 1061 | rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", |
| 947 | sizeof(struct rds_page_frag), | 1062 | sizeof(struct rds_page_frag), |
| 948 | 0, 0, NULL); | 1063 | 0, SLAB_HWCACHE_ALIGN, NULL); |
| 949 | if (rds_ib_frag_slab == NULL) | 1064 | if (!rds_ib_frag_slab) |
| 950 | kmem_cache_destroy(rds_ib_incoming_slab); | 1065 | kmem_cache_destroy(rds_ib_incoming_slab); |
| 951 | else | 1066 | else |
| 952 | ret = 0; | 1067 | ret = 0; |
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 17fa80803ab0..71f373c421bc 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c | |||
| @@ -36,11 +36,49 @@ | |||
| 36 | #include <linux/dmapool.h> | 36 | #include <linux/dmapool.h> |
| 37 | 37 | ||
| 38 | #include "rds.h" | 38 | #include "rds.h" |
| 39 | #include "rdma.h" | ||
| 40 | #include "ib.h" | 39 | #include "ib.h" |
| 41 | 40 | ||
| 42 | static void rds_ib_send_rdma_complete(struct rds_message *rm, | 41 | static char *rds_ib_wc_status_strings[] = { |
| 43 | int wc_status) | 42 | #define RDS_IB_WC_STATUS_STR(foo) \ |
| 43 | [IB_WC_##foo] = __stringify(IB_WC_##foo) | ||
| 44 | RDS_IB_WC_STATUS_STR(SUCCESS), | ||
| 45 | RDS_IB_WC_STATUS_STR(LOC_LEN_ERR), | ||
| 46 | RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR), | ||
| 47 | RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR), | ||
| 48 | RDS_IB_WC_STATUS_STR(LOC_PROT_ERR), | ||
| 49 | RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR), | ||
| 50 | RDS_IB_WC_STATUS_STR(MW_BIND_ERR), | ||
| 51 | RDS_IB_WC_STATUS_STR(BAD_RESP_ERR), | ||
| 52 | RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR), | ||
| 53 | RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR), | ||
| 54 | RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR), | ||
| 55 | RDS_IB_WC_STATUS_STR(REM_OP_ERR), | ||
| 56 | RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR), | ||
| 57 | RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR), | ||
| 58 | RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR), | ||
| 59 | RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR), | ||
| 60 | RDS_IB_WC_STATUS_STR(REM_ABORT_ERR), | ||
| 61 | RDS_IB_WC_STATUS_STR(INV_EECN_ERR), | ||
| 62 | RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR), | ||
| 63 | RDS_IB_WC_STATUS_STR(FATAL_ERR), | ||
| 64 | RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR), | ||
| 65 | RDS_IB_WC_STATUS_STR(GENERAL_ERR), | ||
| 66 | #undef RDS_IB_WC_STATUS_STR | ||
| 67 | }; | ||
| 68 | |||
| 69 | char *rds_ib_wc_status_str(enum ib_wc_status status) | ||
| 70 | { | ||
| 71 | return rds_str_array(rds_ib_wc_status_strings, | ||
| 72 | ARRAY_SIZE(rds_ib_wc_status_strings), status); | ||
| 73 | } | ||
| 74 | |||
| 75 | /* | ||
| 76 | * Convert IB-specific error message to RDS error message and call core | ||
| 77 | * completion handler. | ||
| 78 | */ | ||
| 79 | static void rds_ib_send_complete(struct rds_message *rm, | ||
| 80 | int wc_status, | ||
| 81 | void (*complete)(struct rds_message *rm, int status)) | ||
| 44 | { | 82 | { |
| 45 | int notify_status; | 83 | int notify_status; |
| 46 | 84 | ||
| @@ -60,69 +98,125 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm, | |||
| 60 | notify_status = RDS_RDMA_OTHER_ERROR; | 98 | notify_status = RDS_RDMA_OTHER_ERROR; |
| 61 | break; | 99 | break; |
| 62 | } | 100 | } |
| 63 | rds_rdma_send_complete(rm, notify_status); | 101 | complete(rm, notify_status); |
| 102 | } | ||
| 103 | |||
| 104 | static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, | ||
| 105 | struct rm_data_op *op, | ||
| 106 | int wc_status) | ||
| 107 | { | ||
| 108 | if (op->op_nents) | ||
| 109 | ib_dma_unmap_sg(ic->i_cm_id->device, | ||
| 110 | op->op_sg, op->op_nents, | ||
| 111 | DMA_TO_DEVICE); | ||
| 64 | } | 112 | } |
| 65 | 113 | ||
| 66 | static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, | 114 | static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, |
| 67 | struct rds_rdma_op *op) | 115 | struct rm_rdma_op *op, |
| 116 | int wc_status) | ||
| 68 | { | 117 | { |
| 69 | if (op->r_mapped) { | 118 | if (op->op_mapped) { |
| 70 | ib_dma_unmap_sg(ic->i_cm_id->device, | 119 | ib_dma_unmap_sg(ic->i_cm_id->device, |
| 71 | op->r_sg, op->r_nents, | 120 | op->op_sg, op->op_nents, |
| 72 | op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); | 121 | op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); |
| 73 | op->r_mapped = 0; | 122 | op->op_mapped = 0; |
| 74 | } | 123 | } |
| 124 | |||
| 125 | /* If the user asked for a completion notification on this | ||
| 126 | * message, we can implement three different semantics: | ||
| 127 | * 1. Notify when we received the ACK on the RDS message | ||
| 128 | * that was queued with the RDMA. This provides reliable | ||
| 129 | * notification of RDMA status at the expense of a one-way | ||
| 130 | * packet delay. | ||
| 131 | * 2. Notify when the IB stack gives us the completion event for | ||
| 132 | * the RDMA operation. | ||
| 133 | * 3. Notify when the IB stack gives us the completion event for | ||
| 134 | * the accompanying RDS messages. | ||
| 135 | * Here, we implement approach #3. To implement approach #2, | ||
| 136 | * we would need to take an event for the rdma WR. To implement #1, | ||
| 137 | * don't call rds_rdma_send_complete at all, and fall back to the notify | ||
| 138 | * handling in the ACK processing code. | ||
| 139 | * | ||
| 140 | * Note: There's no need to explicitly sync any RDMA buffers using | ||
| 141 | * ib_dma_sync_sg_for_cpu - the completion for the RDMA | ||
| 142 | * operation itself unmapped the RDMA buffers, which takes care | ||
| 143 | * of synching. | ||
| 144 | */ | ||
| 145 | rds_ib_send_complete(container_of(op, struct rds_message, rdma), | ||
| 146 | wc_status, rds_rdma_send_complete); | ||
| 147 | |||
| 148 | if (op->op_write) | ||
| 149 | rds_stats_add(s_send_rdma_bytes, op->op_bytes); | ||
| 150 | else | ||
| 151 | rds_stats_add(s_recv_rdma_bytes, op->op_bytes); | ||
| 75 | } | 152 | } |
| 76 | 153 | ||
| 77 | static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, | 154 | static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, |
| 78 | struct rds_ib_send_work *send, | 155 | struct rm_atomic_op *op, |
| 79 | int wc_status) | 156 | int wc_status) |
| 80 | { | 157 | { |
| 81 | struct rds_message *rm = send->s_rm; | 158 | /* unmap atomic recvbuf */ |
| 82 | 159 | if (op->op_mapped) { | |
| 83 | rdsdebug("ic %p send %p rm %p\n", ic, send, rm); | 160 | ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1, |
| 84 | 161 | DMA_FROM_DEVICE); | |
| 85 | ib_dma_unmap_sg(ic->i_cm_id->device, | 162 | op->op_mapped = 0; |
| 86 | rm->m_sg, rm->m_nents, | 163 | } |
| 87 | DMA_TO_DEVICE); | ||
| 88 | |||
| 89 | if (rm->m_rdma_op != NULL) { | ||
| 90 | rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); | ||
| 91 | |||
| 92 | /* If the user asked for a completion notification on this | ||
| 93 | * message, we can implement three different semantics: | ||
| 94 | * 1. Notify when we received the ACK on the RDS message | ||
| 95 | * that was queued with the RDMA. This provides reliable | ||
| 96 | * notification of RDMA status at the expense of a one-way | ||
| 97 | * packet delay. | ||
| 98 | * 2. Notify when the IB stack gives us the completion event for | ||
| 99 | * the RDMA operation. | ||
| 100 | * 3. Notify when the IB stack gives us the completion event for | ||
| 101 | * the accompanying RDS messages. | ||
| 102 | * Here, we implement approach #3. To implement approach #2, | ||
| 103 | * call rds_rdma_send_complete from the cq_handler. To implement #1, | ||
| 104 | * don't call rds_rdma_send_complete at all, and fall back to the notify | ||
| 105 | * handling in the ACK processing code. | ||
| 106 | * | ||
| 107 | * Note: There's no need to explicitly sync any RDMA buffers using | ||
| 108 | * ib_dma_sync_sg_for_cpu - the completion for the RDMA | ||
| 109 | * operation itself unmapped the RDMA buffers, which takes care | ||
| 110 | * of synching. | ||
| 111 | */ | ||
| 112 | rds_ib_send_rdma_complete(rm, wc_status); | ||
| 113 | 164 | ||
| 114 | if (rm->m_rdma_op->r_write) | 165 | rds_ib_send_complete(container_of(op, struct rds_message, atomic), |
| 115 | rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); | 166 | wc_status, rds_atomic_send_complete); |
| 116 | else | 167 | |
| 117 | rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); | 168 | if (op->op_type == RDS_ATOMIC_TYPE_CSWP) |
| 169 | rds_ib_stats_inc(s_ib_atomic_cswp); | ||
| 170 | else | ||
| 171 | rds_ib_stats_inc(s_ib_atomic_fadd); | ||
| 172 | } | ||
| 173 | |||
| 174 | /* | ||
| 175 | * Unmap the resources associated with a struct send_work. | ||
| 176 | * | ||
| 177 | * Returns the rm for no good reason other than it is unobtainable | ||
| 178 | * other than by switching on wr.opcode, currently, and the caller, | ||
| 179 | * the event handler, needs it. | ||
| 180 | */ | ||
| 181 | static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic, | ||
| 182 | struct rds_ib_send_work *send, | ||
| 183 | int wc_status) | ||
| 184 | { | ||
| 185 | struct rds_message *rm = NULL; | ||
| 186 | |||
| 187 | /* In the error case, wc.opcode sometimes contains garbage */ | ||
| 188 | switch (send->s_wr.opcode) { | ||
| 189 | case IB_WR_SEND: | ||
| 190 | if (send->s_op) { | ||
| 191 | rm = container_of(send->s_op, struct rds_message, data); | ||
| 192 | rds_ib_send_unmap_data(ic, send->s_op, wc_status); | ||
| 193 | } | ||
| 194 | break; | ||
| 195 | case IB_WR_RDMA_WRITE: | ||
| 196 | case IB_WR_RDMA_READ: | ||
| 197 | if (send->s_op) { | ||
| 198 | rm = container_of(send->s_op, struct rds_message, rdma); | ||
| 199 | rds_ib_send_unmap_rdma(ic, send->s_op, wc_status); | ||
| 200 | } | ||
| 201 | break; | ||
| 202 | case IB_WR_ATOMIC_FETCH_AND_ADD: | ||
| 203 | case IB_WR_ATOMIC_CMP_AND_SWP: | ||
| 204 | if (send->s_op) { | ||
| 205 | rm = container_of(send->s_op, struct rds_message, atomic); | ||
| 206 | rds_ib_send_unmap_atomic(ic, send->s_op, wc_status); | ||
| 207 | } | ||
| 208 | break; | ||
| 209 | default: | ||
| 210 | if (printk_ratelimit()) | ||
| 211 | printk(KERN_NOTICE | ||
| 212 | "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", | ||
| 213 | __func__, send->s_wr.opcode); | ||
| 214 | break; | ||
| 118 | } | 215 | } |
| 119 | 216 | ||
| 120 | /* If anyone waited for this message to get flushed out, wake | 217 | send->s_wr.opcode = 0xdead; |
| 121 | * them up now */ | ||
| 122 | rds_message_unmapped(rm); | ||
| 123 | 218 | ||
| 124 | rds_message_put(rm); | 219 | return rm; |
| 125 | send->s_rm = NULL; | ||
| 126 | } | 220 | } |
| 127 | 221 | ||
| 128 | void rds_ib_send_init_ring(struct rds_ib_connection *ic) | 222 | void rds_ib_send_init_ring(struct rds_ib_connection *ic) |
| @@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) | |||
| 133 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { | 227 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { |
| 134 | struct ib_sge *sge; | 228 | struct ib_sge *sge; |
| 135 | 229 | ||
| 136 | send->s_rm = NULL; | ||
| 137 | send->s_op = NULL; | 230 | send->s_op = NULL; |
| 138 | 231 | ||
| 139 | send->s_wr.wr_id = i; | 232 | send->s_wr.wr_id = i; |
| 140 | send->s_wr.sg_list = send->s_sge; | 233 | send->s_wr.sg_list = send->s_sge; |
| 141 | send->s_wr.num_sge = 1; | ||
| 142 | send->s_wr.opcode = IB_WR_SEND; | ||
| 143 | send->s_wr.send_flags = 0; | ||
| 144 | send->s_wr.ex.imm_data = 0; | 234 | send->s_wr.ex.imm_data = 0; |
| 145 | 235 | ||
| 146 | sge = rds_ib_data_sge(ic, send->s_sge); | 236 | sge = &send->s_sge[0]; |
| 147 | sge->lkey = ic->i_mr->lkey; | ||
| 148 | |||
| 149 | sge = rds_ib_header_sge(ic, send->s_sge); | ||
| 150 | sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); | 237 | sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); |
| 151 | sge->length = sizeof(struct rds_header); | 238 | sge->length = sizeof(struct rds_header); |
| 152 | sge->lkey = ic->i_mr->lkey; | 239 | sge->lkey = ic->i_mr->lkey; |
| 240 | |||
| 241 | send->s_sge[1].lkey = ic->i_mr->lkey; | ||
| 153 | } | 242 | } |
| 154 | } | 243 | } |
| 155 | 244 | ||
| @@ -159,16 +248,24 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic) | |||
| 159 | u32 i; | 248 | u32 i; |
| 160 | 249 | ||
| 161 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { | 250 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { |
| 162 | if (send->s_wr.opcode == 0xdead) | 251 | if (send->s_op && send->s_wr.opcode != 0xdead) |
| 163 | continue; | 252 | rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR); |
| 164 | if (send->s_rm) | ||
| 165 | rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); | ||
| 166 | if (send->s_op) | ||
| 167 | rds_ib_send_unmap_rdma(ic, send->s_op); | ||
| 168 | } | 253 | } |
| 169 | } | 254 | } |
| 170 | 255 | ||
| 171 | /* | 256 | /* |
| 257 | * The only fast path caller always has a non-zero nr, so we don't | ||
| 258 | * bother testing nr before performing the atomic sub. | ||
| 259 | */ | ||
| 260 | static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) | ||
| 261 | { | ||
| 262 | if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) && | ||
| 263 | waitqueue_active(&rds_ib_ring_empty_wait)) | ||
| 264 | wake_up(&rds_ib_ring_empty_wait); | ||
| 265 | BUG_ON(atomic_read(&ic->i_signaled_sends) < 0); | ||
| 266 | } | ||
| 267 | |||
| 268 | /* | ||
| 172 | * The _oldest/_free ring operations here race cleanly with the alloc/unalloc | 269 | * The _oldest/_free ring operations here race cleanly with the alloc/unalloc |
| 173 | * operations performed in the send path. As the sender allocs and potentially | 270 | * operations performed in the send path. As the sender allocs and potentially |
| 174 | * unallocs the next free entry in the ring it doesn't alter which is | 271 | * unallocs the next free entry in the ring it doesn't alter which is |
| @@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) | |||
| 178 | { | 275 | { |
| 179 | struct rds_connection *conn = context; | 276 | struct rds_connection *conn = context; |
| 180 | struct rds_ib_connection *ic = conn->c_transport_data; | 277 | struct rds_ib_connection *ic = conn->c_transport_data; |
| 278 | struct rds_message *rm = NULL; | ||
| 181 | struct ib_wc wc; | 279 | struct ib_wc wc; |
| 182 | struct rds_ib_send_work *send; | 280 | struct rds_ib_send_work *send; |
| 183 | u32 completed; | 281 | u32 completed; |
| 184 | u32 oldest; | 282 | u32 oldest; |
| 185 | u32 i = 0; | 283 | u32 i = 0; |
| 186 | int ret; | 284 | int ret; |
| 285 | int nr_sig = 0; | ||
| 187 | 286 | ||
| 188 | rdsdebug("cq %p conn %p\n", cq, conn); | 287 | rdsdebug("cq %p conn %p\n", cq, conn); |
| 189 | rds_ib_stats_inc(s_ib_tx_cq_call); | 288 | rds_ib_stats_inc(s_ib_tx_cq_call); |
| @@ -192,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) | |||
| 192 | rdsdebug("ib_req_notify_cq send failed: %d\n", ret); | 291 | rdsdebug("ib_req_notify_cq send failed: %d\n", ret); |
| 193 | 292 | ||
| 194 | while (ib_poll_cq(cq, 1, &wc) > 0) { | 293 | while (ib_poll_cq(cq, 1, &wc) > 0) { |
| 195 | rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", | 294 | rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", |
| 196 | (unsigned long long)wc.wr_id, wc.status, wc.byte_len, | 295 | (unsigned long long)wc.wr_id, wc.status, |
| 296 | rds_ib_wc_status_str(wc.status), wc.byte_len, | ||
| 197 | be32_to_cpu(wc.ex.imm_data)); | 297 | be32_to_cpu(wc.ex.imm_data)); |
| 198 | rds_ib_stats_inc(s_ib_tx_cq_event); | 298 | rds_ib_stats_inc(s_ib_tx_cq_event); |
| 199 | 299 | ||
| @@ -210,51 +310,30 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) | |||
| 210 | 310 | ||
| 211 | for (i = 0; i < completed; i++) { | 311 | for (i = 0; i < completed; i++) { |
| 212 | send = &ic->i_sends[oldest]; | 312 | send = &ic->i_sends[oldest]; |
| 313 | if (send->s_wr.send_flags & IB_SEND_SIGNALED) | ||
| 314 | nr_sig++; | ||
| 213 | 315 | ||
| 214 | /* In the error case, wc.opcode sometimes contains garbage */ | 316 | rm = rds_ib_send_unmap_op(ic, send, wc.status); |
| 215 | switch (send->s_wr.opcode) { | ||
| 216 | case IB_WR_SEND: | ||
| 217 | if (send->s_rm) | ||
| 218 | rds_ib_send_unmap_rm(ic, send, wc.status); | ||
| 219 | break; | ||
| 220 | case IB_WR_RDMA_WRITE: | ||
| 221 | case IB_WR_RDMA_READ: | ||
| 222 | /* Nothing to be done - the SG list will be unmapped | ||
| 223 | * when the SEND completes. */ | ||
| 224 | break; | ||
| 225 | default: | ||
| 226 | if (printk_ratelimit()) | ||
| 227 | printk(KERN_NOTICE | ||
| 228 | "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", | ||
| 229 | __func__, send->s_wr.opcode); | ||
| 230 | break; | ||
| 231 | } | ||
| 232 | 317 | ||
| 233 | send->s_wr.opcode = 0xdead; | ||
| 234 | send->s_wr.num_sge = 1; | ||
| 235 | if (send->s_queued + HZ/2 < jiffies) | 318 | if (send->s_queued + HZ/2 < jiffies) |
| 236 | rds_ib_stats_inc(s_ib_tx_stalled); | 319 | rds_ib_stats_inc(s_ib_tx_stalled); |
| 237 | 320 | ||
| 238 | /* If a RDMA operation produced an error, signal this right | 321 | if (send->s_op) { |
| 239 | * away. If we don't, the subsequent SEND that goes with this | 322 | if (send->s_op == rm->m_final_op) { |
| 240 | * RDMA will be canceled with ERR_WFLUSH, and the application | 323 | /* If anyone waited for this message to get flushed out, wake |
| 241 | * never learn that the RDMA failed. */ | 324 | * them up now */ |
| 242 | if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { | 325 | rds_message_unmapped(rm); |
| 243 | struct rds_message *rm; | ||
| 244 | |||
| 245 | rm = rds_send_get_message(conn, send->s_op); | ||
| 246 | if (rm) { | ||
| 247 | if (rm->m_rdma_op) | ||
| 248 | rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); | ||
| 249 | rds_ib_send_rdma_complete(rm, wc.status); | ||
| 250 | rds_message_put(rm); | ||
| 251 | } | 326 | } |
| 327 | rds_message_put(rm); | ||
| 328 | send->s_op = NULL; | ||
| 252 | } | 329 | } |
| 253 | 330 | ||
| 254 | oldest = (oldest + 1) % ic->i_send_ring.w_nr; | 331 | oldest = (oldest + 1) % ic->i_send_ring.w_nr; |
| 255 | } | 332 | } |
| 256 | 333 | ||
| 257 | rds_ib_ring_free(&ic->i_send_ring, completed); | 334 | rds_ib_ring_free(&ic->i_send_ring, completed); |
| 335 | rds_ib_sub_signaled(ic, nr_sig); | ||
| 336 | nr_sig = 0; | ||
| 258 | 337 | ||
| 259 | if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || | 338 | if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || |
| 260 | test_bit(0, &conn->c_map_queued)) | 339 | test_bit(0, &conn->c_map_queued)) |
| @@ -262,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) | |||
| 262 | 341 | ||
| 263 | /* We expect errors as the qp is drained during shutdown */ | 342 | /* We expect errors as the qp is drained during shutdown */ |
| 264 | if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { | 343 | if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { |
| 265 | rds_ib_conn_error(conn, | 344 | rds_ib_conn_error(conn, "send completion on %pI4 had status " |
| 266 | "send completion on %pI4 " | 345 | "%u (%s), disconnecting and reconnecting\n", |
| 267 | "had status %u, disconnecting and reconnecting\n", | 346 | &conn->c_faddr, wc.status, |
| 268 | &conn->c_faddr, wc.status); | 347 | rds_ib_wc_status_str(wc.status)); |
| 269 | } | 348 | } |
| 270 | } | 349 | } |
| 271 | } | 350 | } |
| @@ -294,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) | |||
| 294 | * credits (see rds_ib_send_add_credits below). | 373 | * credits (see rds_ib_send_add_credits below). |
| 295 | * | 374 | * |
| 296 | * The RDS send code is essentially single-threaded; rds_send_xmit | 375 | * The RDS send code is essentially single-threaded; rds_send_xmit |
| 297 | * grabs c_send_lock to ensure exclusive access to the send ring. | 376 | * sets RDS_IN_XMIT to ensure exclusive access to the send ring. |
| 298 | * However, the ACK sending code is independent and can race with | 377 | * However, the ACK sending code is independent and can race with |
| 299 | * message SENDs. | 378 | * message SENDs. |
| 300 | * | 379 | * |
| @@ -413,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) | |||
| 413 | set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); | 492 | set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); |
| 414 | } | 493 | } |
| 415 | 494 | ||
| 416 | static inline void | 495 | static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic, |
| 417 | rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, | 496 | struct rds_ib_send_work *send, |
| 418 | struct rds_ib_send_work *send, unsigned int pos, | 497 | bool notify) |
| 419 | unsigned long buffer, unsigned int length, | ||
| 420 | int send_flags) | ||
| 421 | { | 498 | { |
| 422 | struct ib_sge *sge; | 499 | /* |
| 423 | 500 | * We want to delay signaling completions just enough to get | |
| 424 | WARN_ON(pos != send - ic->i_sends); | 501 | * the batching benefits but not so much that we create dead time |
| 425 | 502 | * on the wire. | |
| 426 | send->s_wr.send_flags = send_flags; | 503 | */ |
| 427 | send->s_wr.opcode = IB_WR_SEND; | 504 | if (ic->i_unsignaled_wrs-- == 0 || notify) { |
| 428 | send->s_wr.num_sge = 2; | 505 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; |
| 429 | send->s_wr.next = NULL; | 506 | send->s_wr.send_flags |= IB_SEND_SIGNALED; |
| 430 | send->s_queued = jiffies; | 507 | return 1; |
| 431 | send->s_op = NULL; | ||
| 432 | |||
| 433 | if (length != 0) { | ||
| 434 | sge = rds_ib_data_sge(ic, send->s_sge); | ||
| 435 | sge->addr = buffer; | ||
| 436 | sge->length = length; | ||
| 437 | sge->lkey = ic->i_mr->lkey; | ||
| 438 | |||
| 439 | sge = rds_ib_header_sge(ic, send->s_sge); | ||
| 440 | } else { | ||
| 441 | /* We're sending a packet with no payload. There is only | ||
| 442 | * one SGE */ | ||
| 443 | send->s_wr.num_sge = 1; | ||
| 444 | sge = &send->s_sge[0]; | ||
| 445 | } | 508 | } |
| 446 | 509 | return 0; | |
| 447 | sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); | ||
| 448 | sge->length = sizeof(struct rds_header); | ||
| 449 | sge->lkey = ic->i_mr->lkey; | ||
| 450 | } | 510 | } |
| 451 | 511 | ||
| 452 | /* | 512 | /* |
| @@ -475,13 +535,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
| 475 | u32 pos; | 535 | u32 pos; |
| 476 | u32 i; | 536 | u32 i; |
| 477 | u32 work_alloc; | 537 | u32 work_alloc; |
| 478 | u32 credit_alloc; | 538 | u32 credit_alloc = 0; |
| 479 | u32 posted; | 539 | u32 posted; |
| 480 | u32 adv_credits = 0; | 540 | u32 adv_credits = 0; |
| 481 | int send_flags = 0; | 541 | int send_flags = 0; |
| 482 | int sent; | 542 | int bytes_sent = 0; |
| 483 | int ret; | 543 | int ret; |
| 484 | int flow_controlled = 0; | 544 | int flow_controlled = 0; |
| 545 | int nr_sig = 0; | ||
| 485 | 546 | ||
| 486 | BUG_ON(off % RDS_FRAG_SIZE); | 547 | BUG_ON(off % RDS_FRAG_SIZE); |
| 487 | BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); | 548 | BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); |
| @@ -507,14 +568,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
| 507 | goto out; | 568 | goto out; |
| 508 | } | 569 | } |
| 509 | 570 | ||
| 510 | credit_alloc = work_alloc; | ||
| 511 | if (ic->i_flowctl) { | 571 | if (ic->i_flowctl) { |
| 512 | credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); | 572 | credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); |
| 513 | adv_credits += posted; | 573 | adv_credits += posted; |
| 514 | if (credit_alloc < work_alloc) { | 574 | if (credit_alloc < work_alloc) { |
| 515 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); | 575 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); |
| 516 | work_alloc = credit_alloc; | 576 | work_alloc = credit_alloc; |
| 517 | flow_controlled++; | 577 | flow_controlled = 1; |
| 518 | } | 578 | } |
| 519 | if (work_alloc == 0) { | 579 | if (work_alloc == 0) { |
| 520 | set_bit(RDS_LL_SEND_FULL, &conn->c_flags); | 580 | set_bit(RDS_LL_SEND_FULL, &conn->c_flags); |
| @@ -525,31 +585,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
| 525 | } | 585 | } |
| 526 | 586 | ||
| 527 | /* map the message the first time we see it */ | 587 | /* map the message the first time we see it */ |
| 528 | if (ic->i_rm == NULL) { | 588 | if (!ic->i_data_op) { |
| 529 | /* | 589 | if (rm->data.op_nents) { |
| 530 | printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", | 590 | rm->data.op_count = ib_dma_map_sg(dev, |
| 531 | be16_to_cpu(rm->m_inc.i_hdr.h_dport), | 591 | rm->data.op_sg, |
| 532 | rm->m_inc.i_hdr.h_flags, | 592 | rm->data.op_nents, |
| 533 | be32_to_cpu(rm->m_inc.i_hdr.h_len)); | 593 | DMA_TO_DEVICE); |
| 534 | */ | 594 | rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); |
| 535 | if (rm->m_nents) { | 595 | if (rm->data.op_count == 0) { |
| 536 | rm->m_count = ib_dma_map_sg(dev, | ||
| 537 | rm->m_sg, rm->m_nents, DMA_TO_DEVICE); | ||
| 538 | rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); | ||
| 539 | if (rm->m_count == 0) { | ||
| 540 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | 596 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); |
| 541 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | 597 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); |
| 542 | ret = -ENOMEM; /* XXX ? */ | 598 | ret = -ENOMEM; /* XXX ? */ |
| 543 | goto out; | 599 | goto out; |
| 544 | } | 600 | } |
| 545 | } else { | 601 | } else { |
| 546 | rm->m_count = 0; | 602 | rm->data.op_count = 0; |
| 547 | } | 603 | } |
| 548 | 604 | ||
| 549 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
| 550 | ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; | ||
| 551 | rds_message_addref(rm); | 605 | rds_message_addref(rm); |
| 552 | ic->i_rm = rm; | 606 | ic->i_data_op = &rm->data; |
| 553 | 607 | ||
| 554 | /* Finalize the header */ | 608 | /* Finalize the header */ |
| 555 | if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) | 609 | if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) |
| @@ -559,10 +613,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
| 559 | 613 | ||
| 560 | /* If it has a RDMA op, tell the peer we did it. This is | 614 | /* If it has a RDMA op, tell the peer we did it. This is |
| 561 | * used by the peer to release use-once RDMA MRs. */ | 615 | * used by the peer to release use-once RDMA MRs. */ |
| 562 | if (rm->m_rdma_op) { | 616 | if (rm->rdma.op_active) { |
| 563 | struct rds_ext_header_rdma ext_hdr; | 617 | struct rds_ext_header_rdma ext_hdr; |
| 564 | 618 | ||
| 565 | ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); | 619 | ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); |
| 566 | rds_message_add_extension(&rm->m_inc.i_hdr, | 620 | rds_message_add_extension(&rm->m_inc.i_hdr, |
| 567 | RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); | 621 | RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); |
| 568 | } | 622 | } |
| @@ -582,99 +636,77 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
| 582 | /* | 636 | /* |
| 583 | * Update adv_credits since we reset the ACK_REQUIRED bit. | 637 | * Update adv_credits since we reset the ACK_REQUIRED bit. |
| 584 | */ | 638 | */ |
| 585 | rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); | 639 | if (ic->i_flowctl) { |
| 586 | adv_credits += posted; | 640 | rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); |
| 587 | BUG_ON(adv_credits > 255); | 641 | adv_credits += posted; |
| 642 | BUG_ON(adv_credits > 255); | ||
| 643 | } | ||
| 588 | } | 644 | } |
| 589 | 645 | ||
| 590 | send = &ic->i_sends[pos]; | ||
| 591 | first = send; | ||
| 592 | prev = NULL; | ||
| 593 | scat = &rm->m_sg[sg]; | ||
| 594 | sent = 0; | ||
| 595 | i = 0; | ||
| 596 | |||
| 597 | /* Sometimes you want to put a fence between an RDMA | 646 | /* Sometimes you want to put a fence between an RDMA |
| 598 | * READ and the following SEND. | 647 | * READ and the following SEND. |
| 599 | * We could either do this all the time | 648 | * We could either do this all the time |
| 600 | * or when requested by the user. Right now, we let | 649 | * or when requested by the user. Right now, we let |
| 601 | * the application choose. | 650 | * the application choose. |
| 602 | */ | 651 | */ |
| 603 | if (rm->m_rdma_op && rm->m_rdma_op->r_fence) | 652 | if (rm->rdma.op_active && rm->rdma.op_fence) |
| 604 | send_flags = IB_SEND_FENCE; | 653 | send_flags = IB_SEND_FENCE; |
| 605 | 654 | ||
| 606 | /* | 655 | /* Each frag gets a header. Msgs may be 0 bytes */ |
| 607 | * We could be copying the header into the unused tail of the page. | 656 | send = &ic->i_sends[pos]; |
| 608 | * That would need to be changed in the future when those pages might | 657 | first = send; |
| 609 | * be mapped userspace pages or page cache pages. So instead we always | 658 | prev = NULL; |
| 610 | * use a second sge and our long-lived ring of mapped headers. We send | 659 | scat = &ic->i_data_op->op_sg[sg]; |
| 611 | * the header after the data so that the data payload can be aligned on | 660 | i = 0; |
| 612 | * the receiver. | 661 | do { |
| 613 | */ | 662 | unsigned int len = 0; |
| 614 | 663 | ||
| 615 | /* handle a 0-len message */ | 664 | /* Set up the header */ |
| 616 | if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { | 665 | send->s_wr.send_flags = send_flags; |
| 617 | rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); | 666 | send->s_wr.opcode = IB_WR_SEND; |
| 618 | goto add_header; | 667 | send->s_wr.num_sge = 1; |
| 619 | } | 668 | send->s_wr.next = NULL; |
| 669 | send->s_queued = jiffies; | ||
| 670 | send->s_op = NULL; | ||
| 620 | 671 | ||
| 621 | /* if there's data reference it with a chain of work reqs */ | 672 | send->s_sge[0].addr = ic->i_send_hdrs_dma |
| 622 | for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { | 673 | + (pos * sizeof(struct rds_header)); |
| 623 | unsigned int len; | 674 | send->s_sge[0].length = sizeof(struct rds_header); |
| 624 | 675 | ||
| 625 | send = &ic->i_sends[pos]; | 676 | memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); |
| 626 | 677 | ||
| 627 | len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); | 678 | /* Set up the data, if present */ |
| 628 | rds_ib_xmit_populate_wr(ic, send, pos, | 679 | if (i < work_alloc |
| 629 | ib_sg_dma_address(dev, scat) + off, len, | 680 | && scat != &rm->data.op_sg[rm->data.op_count]) { |
| 630 | send_flags); | 681 | len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); |
| 682 | send->s_wr.num_sge = 2; | ||
| 631 | 683 | ||
| 632 | /* | 684 | send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; |
| 633 | * We want to delay signaling completions just enough to get | 685 | send->s_sge[1].length = len; |
| 634 | * the batching benefits but not so much that we create dead time | ||
| 635 | * on the wire. | ||
| 636 | */ | ||
| 637 | if (ic->i_unsignaled_wrs-- == 0) { | ||
| 638 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
| 639 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | ||
| 640 | } | ||
| 641 | 686 | ||
| 642 | ic->i_unsignaled_bytes -= len; | 687 | bytes_sent += len; |
| 643 | if (ic->i_unsignaled_bytes <= 0) { | 688 | off += len; |
| 644 | ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; | 689 | if (off == ib_sg_dma_len(dev, scat)) { |
| 645 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | 690 | scat++; |
| 691 | off = 0; | ||
| 692 | } | ||
| 646 | } | 693 | } |
| 647 | 694 | ||
| 695 | rds_ib_set_wr_signal_state(ic, send, 0); | ||
| 696 | |||
| 648 | /* | 697 | /* |
| 649 | * Always signal the last one if we're stopping due to flow control. | 698 | * Always signal the last one if we're stopping due to flow control. |
| 650 | */ | 699 | */ |
| 651 | if (flow_controlled && i == (work_alloc-1)) | 700 | if (ic->i_flowctl && flow_controlled && i == (work_alloc-1)) |
| 652 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | 701 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; |
| 653 | 702 | ||
| 703 | if (send->s_wr.send_flags & IB_SEND_SIGNALED) | ||
| 704 | nr_sig++; | ||
| 705 | |||
| 654 | rdsdebug("send %p wr %p num_sge %u next %p\n", send, | 706 | rdsdebug("send %p wr %p num_sge %u next %p\n", send, |
| 655 | &send->s_wr, send->s_wr.num_sge, send->s_wr.next); | 707 | &send->s_wr, send->s_wr.num_sge, send->s_wr.next); |
| 656 | 708 | ||
| 657 | sent += len; | 709 | if (ic->i_flowctl && adv_credits) { |
| 658 | off += len; | ||
| 659 | if (off == ib_sg_dma_len(dev, scat)) { | ||
| 660 | scat++; | ||
| 661 | off = 0; | ||
| 662 | } | ||
| 663 | |||
| 664 | add_header: | ||
| 665 | /* Tack on the header after the data. The header SGE should already | ||
| 666 | * have been set up to point to the right header buffer. */ | ||
| 667 | memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); | ||
| 668 | |||
| 669 | if (0) { | ||
| 670 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; | ||
| 671 | |||
| 672 | printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", | ||
| 673 | be16_to_cpu(hdr->h_dport), | ||
| 674 | hdr->h_flags, | ||
| 675 | be32_to_cpu(hdr->h_len)); | ||
| 676 | } | ||
| 677 | if (adv_credits) { | ||
| 678 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; | 710 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; |
| 679 | 711 | ||
| 680 | /* add credit and redo the header checksum */ | 712 | /* add credit and redo the header checksum */ |
| @@ -689,20 +721,25 @@ add_header: | |||
| 689 | prev = send; | 721 | prev = send; |
| 690 | 722 | ||
| 691 | pos = (pos + 1) % ic->i_send_ring.w_nr; | 723 | pos = (pos + 1) % ic->i_send_ring.w_nr; |
| 692 | } | 724 | send = &ic->i_sends[pos]; |
| 725 | i++; | ||
| 726 | |||
| 727 | } while (i < work_alloc | ||
| 728 | && scat != &rm->data.op_sg[rm->data.op_count]); | ||
| 693 | 729 | ||
| 694 | /* Account the RDS header in the number of bytes we sent, but just once. | 730 | /* Account the RDS header in the number of bytes we sent, but just once. |
| 695 | * The caller has no concept of fragmentation. */ | 731 | * The caller has no concept of fragmentation. */ |
| 696 | if (hdr_off == 0) | 732 | if (hdr_off == 0) |
| 697 | sent += sizeof(struct rds_header); | 733 | bytes_sent += sizeof(struct rds_header); |
| 698 | 734 | ||
| 699 | /* if we finished the message then send completion owns it */ | 735 | /* if we finished the message then send completion owns it */ |
| 700 | if (scat == &rm->m_sg[rm->m_count]) { | 736 | if (scat == &rm->data.op_sg[rm->data.op_count]) { |
| 701 | prev->s_rm = ic->i_rm; | 737 | prev->s_op = ic->i_data_op; |
| 702 | prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | 738 | prev->s_wr.send_flags |= IB_SEND_SOLICITED; |
| 703 | ic->i_rm = NULL; | 739 | ic->i_data_op = NULL; |
| 704 | } | 740 | } |
| 705 | 741 | ||
| 742 | /* Put back wrs & credits we didn't use */ | ||
| 706 | if (i < work_alloc) { | 743 | if (i < work_alloc) { |
| 707 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); | 744 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); |
| 708 | work_alloc = i; | 745 | work_alloc = i; |
| @@ -710,6 +747,9 @@ add_header: | |||
| 710 | if (ic->i_flowctl && i < credit_alloc) | 747 | if (ic->i_flowctl && i < credit_alloc) |
| 711 | rds_ib_send_add_credits(conn, credit_alloc - i); | 748 | rds_ib_send_add_credits(conn, credit_alloc - i); |
| 712 | 749 | ||
| 750 | if (nr_sig) | ||
| 751 | atomic_add(nr_sig, &ic->i_signaled_sends); | ||
| 752 | |||
| 713 | /* XXX need to worry about failed_wr and partial sends. */ | 753 | /* XXX need to worry about failed_wr and partial sends. */ |
| 714 | failed_wr = &first->s_wr; | 754 | failed_wr = &first->s_wr; |
| 715 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); | 755 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); |
| @@ -720,32 +760,127 @@ add_header: | |||
| 720 | printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " | 760 | printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " |
| 721 | "returned %d\n", &conn->c_faddr, ret); | 761 | "returned %d\n", &conn->c_faddr, ret); |
| 722 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | 762 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); |
| 723 | if (prev->s_rm) { | 763 | rds_ib_sub_signaled(ic, nr_sig); |
| 724 | ic->i_rm = prev->s_rm; | 764 | if (prev->s_op) { |
| 725 | prev->s_rm = NULL; | 765 | ic->i_data_op = prev->s_op; |
| 766 | prev->s_op = NULL; | ||
| 726 | } | 767 | } |
| 727 | 768 | ||
| 728 | rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); | 769 | rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); |
| 729 | goto out; | 770 | goto out; |
| 730 | } | 771 | } |
| 731 | 772 | ||
| 732 | ret = sent; | 773 | ret = bytes_sent; |
| 733 | out: | 774 | out: |
| 734 | BUG_ON(adv_credits); | 775 | BUG_ON(adv_credits); |
| 735 | return ret; | 776 | return ret; |
| 736 | } | 777 | } |
| 737 | 778 | ||
| 738 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | 779 | /* |
| 780 | * Issue atomic operation. | ||
| 781 | * A simplified version of the rdma case, we always map 1 SG, and | ||
| 782 | * only 8 bytes, for the return value from the atomic operation. | ||
| 783 | */ | ||
| 784 | int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) | ||
| 785 | { | ||
| 786 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
| 787 | struct rds_ib_send_work *send = NULL; | ||
| 788 | struct ib_send_wr *failed_wr; | ||
| 789 | struct rds_ib_device *rds_ibdev; | ||
| 790 | u32 pos; | ||
| 791 | u32 work_alloc; | ||
| 792 | int ret; | ||
| 793 | int nr_sig = 0; | ||
| 794 | |||
| 795 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | ||
| 796 | |||
| 797 | work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); | ||
| 798 | if (work_alloc != 1) { | ||
| 799 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
| 800 | rds_ib_stats_inc(s_ib_tx_ring_full); | ||
| 801 | ret = -ENOMEM; | ||
| 802 | goto out; | ||
| 803 | } | ||
| 804 | |||
| 805 | /* address of send request in ring */ | ||
| 806 | send = &ic->i_sends[pos]; | ||
| 807 | send->s_queued = jiffies; | ||
| 808 | |||
| 809 | if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { | ||
| 810 | send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; | ||
| 811 | send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare; | ||
| 812 | send->s_wr.wr.atomic.swap = op->op_m_cswp.swap; | ||
| 813 | send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask; | ||
| 814 | send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask; | ||
| 815 | } else { /* FADD */ | ||
| 816 | send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; | ||
| 817 | send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add; | ||
| 818 | send->s_wr.wr.atomic.swap = 0; | ||
| 819 | send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask; | ||
| 820 | send->s_wr.wr.atomic.swap_mask = 0; | ||
| 821 | } | ||
| 822 | nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); | ||
| 823 | send->s_wr.num_sge = 1; | ||
| 824 | send->s_wr.next = NULL; | ||
| 825 | send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; | ||
| 826 | send->s_wr.wr.atomic.rkey = op->op_rkey; | ||
| 827 | send->s_op = op; | ||
| 828 | rds_message_addref(container_of(send->s_op, struct rds_message, atomic)); | ||
| 829 | |||
| 830 | /* map 8 byte retval buffer to the device */ | ||
| 831 | ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE); | ||
| 832 | rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret); | ||
| 833 | if (ret != 1) { | ||
| 834 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
| 835 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | ||
| 836 | ret = -ENOMEM; /* XXX ? */ | ||
| 837 | goto out; | ||
| 838 | } | ||
| 839 | |||
| 840 | /* Convert our struct scatterlist to struct ib_sge */ | ||
| 841 | send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); | ||
| 842 | send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); | ||
| 843 | send->s_sge[0].lkey = ic->i_mr->lkey; | ||
| 844 | |||
| 845 | rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, | ||
| 846 | send->s_sge[0].addr, send->s_sge[0].length); | ||
| 847 | |||
| 848 | if (nr_sig) | ||
| 849 | atomic_add(nr_sig, &ic->i_signaled_sends); | ||
| 850 | |||
| 851 | failed_wr = &send->s_wr; | ||
| 852 | ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr); | ||
| 853 | rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic, | ||
| 854 | send, &send->s_wr, ret, failed_wr); | ||
| 855 | BUG_ON(failed_wr != &send->s_wr); | ||
| 856 | if (ret) { | ||
| 857 | printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " | ||
| 858 | "returned %d\n", &conn->c_faddr, ret); | ||
| 859 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
| 860 | rds_ib_sub_signaled(ic, nr_sig); | ||
| 861 | goto out; | ||
| 862 | } | ||
| 863 | |||
| 864 | if (unlikely(failed_wr != &send->s_wr)) { | ||
| 865 | printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret); | ||
| 866 | BUG_ON(failed_wr != &send->s_wr); | ||
| 867 | } | ||
| 868 | |||
| 869 | out: | ||
| 870 | return ret; | ||
| 871 | } | ||
| 872 | |||
| 873 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) | ||
| 739 | { | 874 | { |
| 740 | struct rds_ib_connection *ic = conn->c_transport_data; | 875 | struct rds_ib_connection *ic = conn->c_transport_data; |
| 741 | struct rds_ib_send_work *send = NULL; | 876 | struct rds_ib_send_work *send = NULL; |
| 742 | struct rds_ib_send_work *first; | 877 | struct rds_ib_send_work *first; |
| 743 | struct rds_ib_send_work *prev; | 878 | struct rds_ib_send_work *prev; |
| 744 | struct ib_send_wr *failed_wr; | 879 | struct ib_send_wr *failed_wr; |
| 745 | struct rds_ib_device *rds_ibdev; | ||
| 746 | struct scatterlist *scat; | 880 | struct scatterlist *scat; |
| 747 | unsigned long len; | 881 | unsigned long len; |
| 748 | u64 remote_addr = op->r_remote_addr; | 882 | u64 remote_addr = op->op_remote_addr; |
| 883 | u32 max_sge = ic->rds_ibdev->max_sge; | ||
| 749 | u32 pos; | 884 | u32 pos; |
| 750 | u32 work_alloc; | 885 | u32 work_alloc; |
| 751 | u32 i; | 886 | u32 i; |
| @@ -753,29 +888,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
| 753 | int sent; | 888 | int sent; |
| 754 | int ret; | 889 | int ret; |
| 755 | int num_sge; | 890 | int num_sge; |
| 756 | 891 | int nr_sig = 0; | |
| 757 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | 892 | |
| 758 | 893 | /* map the op the first time we see it */ | |
| 759 | /* map the message the first time we see it */ | 894 | if (!op->op_mapped) { |
| 760 | if (!op->r_mapped) { | 895 | op->op_count = ib_dma_map_sg(ic->i_cm_id->device, |
| 761 | op->r_count = ib_dma_map_sg(ic->i_cm_id->device, | 896 | op->op_sg, op->op_nents, (op->op_write) ? |
| 762 | op->r_sg, op->r_nents, (op->r_write) ? | 897 | DMA_TO_DEVICE : DMA_FROM_DEVICE); |
| 763 | DMA_TO_DEVICE : DMA_FROM_DEVICE); | 898 | rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); |
| 764 | rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); | 899 | if (op->op_count == 0) { |
| 765 | if (op->r_count == 0) { | ||
| 766 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | 900 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); |
| 767 | ret = -ENOMEM; /* XXX ? */ | 901 | ret = -ENOMEM; /* XXX ? */ |
| 768 | goto out; | 902 | goto out; |
| 769 | } | 903 | } |
| 770 | 904 | ||
| 771 | op->r_mapped = 1; | 905 | op->op_mapped = 1; |
| 772 | } | 906 | } |
| 773 | 907 | ||
| 774 | /* | 908 | /* |
| 775 | * Instead of knowing how to return a partial rdma read/write we insist that there | 909 | * Instead of knowing how to return a partial rdma read/write we insist that there |
| 776 | * be enough work requests to send the entire message. | 910 | * be enough work requests to send the entire message. |
| 777 | */ | 911 | */ |
| 778 | i = ceil(op->r_count, rds_ibdev->max_sge); | 912 | i = ceil(op->op_count, max_sge); |
| 779 | 913 | ||
| 780 | work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); | 914 | work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); |
| 781 | if (work_alloc != i) { | 915 | if (work_alloc != i) { |
| @@ -788,30 +922,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
| 788 | send = &ic->i_sends[pos]; | 922 | send = &ic->i_sends[pos]; |
| 789 | first = send; | 923 | first = send; |
| 790 | prev = NULL; | 924 | prev = NULL; |
| 791 | scat = &op->r_sg[0]; | 925 | scat = &op->op_sg[0]; |
| 792 | sent = 0; | 926 | sent = 0; |
| 793 | num_sge = op->r_count; | 927 | num_sge = op->op_count; |
| 794 | 928 | ||
| 795 | for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { | 929 | for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { |
| 796 | send->s_wr.send_flags = 0; | 930 | send->s_wr.send_flags = 0; |
| 797 | send->s_queued = jiffies; | 931 | send->s_queued = jiffies; |
| 798 | /* | 932 | send->s_op = NULL; |
| 799 | * We want to delay signaling completions just enough to get | 933 | |
| 800 | * the batching benefits but not so much that we create dead time on the wire. | 934 | nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); |
| 801 | */ | ||
| 802 | if (ic->i_unsignaled_wrs-- == 0) { | ||
| 803 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
| 804 | send->s_wr.send_flags = IB_SEND_SIGNALED; | ||
| 805 | } | ||
| 806 | 935 | ||
| 807 | send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; | 936 | send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; |
| 808 | send->s_wr.wr.rdma.remote_addr = remote_addr; | 937 | send->s_wr.wr.rdma.remote_addr = remote_addr; |
| 809 | send->s_wr.wr.rdma.rkey = op->r_key; | 938 | send->s_wr.wr.rdma.rkey = op->op_rkey; |
| 810 | send->s_op = op; | ||
| 811 | 939 | ||
| 812 | if (num_sge > rds_ibdev->max_sge) { | 940 | if (num_sge > max_sge) { |
| 813 | send->s_wr.num_sge = rds_ibdev->max_sge; | 941 | send->s_wr.num_sge = max_sge; |
| 814 | num_sge -= rds_ibdev->max_sge; | 942 | num_sge -= max_sge; |
| 815 | } else { | 943 | } else { |
| 816 | send->s_wr.num_sge = num_sge; | 944 | send->s_wr.num_sge = num_sge; |
| 817 | } | 945 | } |
| @@ -821,7 +949,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
| 821 | if (prev) | 949 | if (prev) |
| 822 | prev->s_wr.next = &send->s_wr; | 950 | prev->s_wr.next = &send->s_wr; |
| 823 | 951 | ||
| 824 | for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { | 952 | for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { |
| 825 | len = ib_sg_dma_len(ic->i_cm_id->device, scat); | 953 | len = ib_sg_dma_len(ic->i_cm_id->device, scat); |
| 826 | send->s_sge[j].addr = | 954 | send->s_sge[j].addr = |
| 827 | ib_sg_dma_address(ic->i_cm_id->device, scat); | 955 | ib_sg_dma_address(ic->i_cm_id->device, scat); |
| @@ -843,15 +971,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
| 843 | send = ic->i_sends; | 971 | send = ic->i_sends; |
| 844 | } | 972 | } |
| 845 | 973 | ||
| 846 | /* if we finished the message then send completion owns it */ | 974 | /* give a reference to the last op */ |
| 847 | if (scat == &op->r_sg[op->r_count]) | 975 | if (scat == &op->op_sg[op->op_count]) { |
| 848 | prev->s_wr.send_flags = IB_SEND_SIGNALED; | 976 | prev->s_op = op; |
| 977 | rds_message_addref(container_of(op, struct rds_message, rdma)); | ||
| 978 | } | ||
| 849 | 979 | ||
| 850 | if (i < work_alloc) { | 980 | if (i < work_alloc) { |
| 851 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); | 981 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); |
| 852 | work_alloc = i; | 982 | work_alloc = i; |
| 853 | } | 983 | } |
| 854 | 984 | ||
| 985 | if (nr_sig) | ||
| 986 | atomic_add(nr_sig, &ic->i_signaled_sends); | ||
| 987 | |||
| 855 | failed_wr = &first->s_wr; | 988 | failed_wr = &first->s_wr; |
| 856 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); | 989 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); |
| 857 | rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, | 990 | rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, |
| @@ -861,6 +994,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
| 861 | printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " | 994 | printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " |
| 862 | "returned %d\n", &conn->c_faddr, ret); | 995 | "returned %d\n", &conn->c_faddr, ret); |
| 863 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | 996 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); |
| 997 | rds_ib_sub_signaled(ic, nr_sig); | ||
| 864 | goto out; | 998 | goto out; |
| 865 | } | 999 | } |
| 866 | 1000 | ||
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index d2c904dd6fbc..2d5965d6e97c 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c | |||
| @@ -67,6 +67,8 @@ static const char *const rds_ib_stat_names[] = { | |||
| 67 | "ib_rdma_mr_pool_flush", | 67 | "ib_rdma_mr_pool_flush", |
| 68 | "ib_rdma_mr_pool_wait", | 68 | "ib_rdma_mr_pool_wait", |
| 69 | "ib_rdma_mr_pool_depleted", | 69 | "ib_rdma_mr_pool_depleted", |
| 70 | "ib_atomic_cswp", | ||
| 71 | "ib_atomic_fadd", | ||
| 70 | }; | 72 | }; |
| 71 | 73 | ||
| 72 | unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, | 74 | unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, |
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index 03f01cb4e0fe..fc3da37220fd 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c | |||
| @@ -49,10 +49,6 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16; | |||
| 49 | static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; | 49 | static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; |
| 50 | static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; | 50 | static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; |
| 51 | 51 | ||
| 52 | unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20); | ||
| 53 | static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1; | ||
| 54 | static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL; | ||
| 55 | |||
| 56 | /* | 52 | /* |
| 57 | * This sysctl does nothing. | 53 | * This sysctl does nothing. |
| 58 | * | 54 | * |
| @@ -94,15 +90,6 @@ ctl_table rds_ib_sysctl_table[] = { | |||
| 94 | .extra2 = &rds_ib_sysctl_max_unsig_wr_max, | 90 | .extra2 = &rds_ib_sysctl_max_unsig_wr_max, |
| 95 | }, | 91 | }, |
| 96 | { | 92 | { |
| 97 | .procname = "max_unsignaled_bytes", | ||
| 98 | .data = &rds_ib_sysctl_max_unsig_bytes, | ||
| 99 | .maxlen = sizeof(unsigned long), | ||
| 100 | .mode = 0644, | ||
| 101 | .proc_handler = proc_doulongvec_minmax, | ||
| 102 | .extra1 = &rds_ib_sysctl_max_unsig_bytes_min, | ||
| 103 | .extra2 = &rds_ib_sysctl_max_unsig_bytes_max, | ||
| 104 | }, | ||
| 105 | { | ||
| 106 | .procname = "max_recv_allocation", | 93 | .procname = "max_recv_allocation", |
| 107 | .data = &rds_ib_sysctl_max_recv_allocation, | 94 | .data = &rds_ib_sysctl_max_recv_allocation, |
| 108 | .maxlen = sizeof(unsigned long), | 95 | .maxlen = sizeof(unsigned long), |
| @@ -132,10 +119,10 @@ void rds_ib_sysctl_exit(void) | |||
| 132 | unregister_sysctl_table(rds_ib_sysctl_hdr); | 119 | unregister_sysctl_table(rds_ib_sysctl_hdr); |
| 133 | } | 120 | } |
| 134 | 121 | ||
| 135 | int __init rds_ib_sysctl_init(void) | 122 | int rds_ib_sysctl_init(void) |
| 136 | { | 123 | { |
| 137 | rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); | 124 | rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); |
| 138 | if (rds_ib_sysctl_hdr == NULL) | 125 | if (!rds_ib_sysctl_hdr) |
| 139 | return -ENOMEM; | 126 | return -ENOMEM; |
| 140 | return 0; | 127 | return 0; |
| 141 | } | 128 | } |
diff --git a/net/rds/info.c b/net/rds/info.c index c45c4173a44d..4fdf1b6e84ff 100644 --- a/net/rds/info.c +++ b/net/rds/info.c | |||
| @@ -76,7 +76,7 @@ void rds_info_register_func(int optname, rds_info_func func) | |||
| 76 | BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); | 76 | BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); |
| 77 | 77 | ||
| 78 | spin_lock(&rds_info_lock); | 78 | spin_lock(&rds_info_lock); |
| 79 | BUG_ON(rds_info_funcs[offset] != NULL); | 79 | BUG_ON(rds_info_funcs[offset]); |
| 80 | rds_info_funcs[offset] = func; | 80 | rds_info_funcs[offset] = func; |
| 81 | spin_unlock(&rds_info_lock); | 81 | spin_unlock(&rds_info_lock); |
| 82 | } | 82 | } |
| @@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func); | |||
| 102 | */ | 102 | */ |
| 103 | void rds_info_iter_unmap(struct rds_info_iterator *iter) | 103 | void rds_info_iter_unmap(struct rds_info_iterator *iter) |
| 104 | { | 104 | { |
| 105 | if (iter->addr != NULL) { | 105 | if (iter->addr) { |
| 106 | kunmap_atomic(iter->addr, KM_USER0); | 106 | kunmap_atomic(iter->addr, KM_USER0); |
| 107 | iter->addr = NULL; | 107 | iter->addr = NULL; |
| 108 | } | 108 | } |
| @@ -117,7 +117,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data, | |||
| 117 | unsigned long this; | 117 | unsigned long this; |
| 118 | 118 | ||
| 119 | while (bytes) { | 119 | while (bytes) { |
| 120 | if (iter->addr == NULL) | 120 | if (!iter->addr) |
| 121 | iter->addr = kmap_atomic(*iter->pages, KM_USER0); | 121 | iter->addr = kmap_atomic(*iter->pages, KM_USER0); |
| 122 | 122 | ||
| 123 | this = min(bytes, PAGE_SIZE - iter->offset); | 123 | this = min(bytes, PAGE_SIZE - iter->offset); |
| @@ -188,7 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, | |||
| 188 | >> PAGE_SHIFT; | 188 | >> PAGE_SHIFT; |
| 189 | 189 | ||
| 190 | pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); | 190 | pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); |
| 191 | if (pages == NULL) { | 191 | if (!pages) { |
| 192 | ret = -ENOMEM; | 192 | ret = -ENOMEM; |
| 193 | goto out; | 193 | goto out; |
| 194 | } | 194 | } |
| @@ -206,7 +206,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, | |||
| 206 | 206 | ||
| 207 | call_func: | 207 | call_func: |
| 208 | func = rds_info_funcs[optname - RDS_INFO_FIRST]; | 208 | func = rds_info_funcs[optname - RDS_INFO_FIRST]; |
| 209 | if (func == NULL) { | 209 | if (!func) { |
| 210 | ret = -ENOPROTOOPT; | 210 | ret = -ENOPROTOOPT; |
| 211 | goto out; | 211 | goto out; |
| 212 | } | 212 | } |
| @@ -234,7 +234,7 @@ call_func: | |||
| 234 | ret = -EFAULT; | 234 | ret = -EFAULT; |
| 235 | 235 | ||
| 236 | out: | 236 | out: |
| 237 | for (i = 0; pages != NULL && i < nr_pages; i++) | 237 | for (i = 0; pages && i < nr_pages; i++) |
| 238 | put_page(pages[i]); | 238 | put_page(pages[i]); |
| 239 | kfree(pages); | 239 | kfree(pages); |
| 240 | 240 | ||
diff --git a/net/rds/iw.c b/net/rds/iw.c index c8f3d3525cb9..56808cac0fc7 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c | |||
| @@ -264,7 +264,6 @@ struct rds_transport rds_iw_transport = { | |||
| 264 | .laddr_check = rds_iw_laddr_check, | 264 | .laddr_check = rds_iw_laddr_check, |
| 265 | .xmit_complete = rds_iw_xmit_complete, | 265 | .xmit_complete = rds_iw_xmit_complete, |
| 266 | .xmit = rds_iw_xmit, | 266 | .xmit = rds_iw_xmit, |
| 267 | .xmit_cong_map = NULL, | ||
| 268 | .xmit_rdma = rds_iw_xmit_rdma, | 267 | .xmit_rdma = rds_iw_xmit_rdma, |
| 269 | .recv = rds_iw_recv, | 268 | .recv = rds_iw_recv, |
| 270 | .conn_alloc = rds_iw_conn_alloc, | 269 | .conn_alloc = rds_iw_conn_alloc, |
| @@ -272,7 +271,6 @@ struct rds_transport rds_iw_transport = { | |||
| 272 | .conn_connect = rds_iw_conn_connect, | 271 | .conn_connect = rds_iw_conn_connect, |
| 273 | .conn_shutdown = rds_iw_conn_shutdown, | 272 | .conn_shutdown = rds_iw_conn_shutdown, |
| 274 | .inc_copy_to_user = rds_iw_inc_copy_to_user, | 273 | .inc_copy_to_user = rds_iw_inc_copy_to_user, |
| 275 | .inc_purge = rds_iw_inc_purge, | ||
| 276 | .inc_free = rds_iw_inc_free, | 274 | .inc_free = rds_iw_inc_free, |
| 277 | .cm_initiate_connect = rds_iw_cm_initiate_connect, | 275 | .cm_initiate_connect = rds_iw_cm_initiate_connect, |
| 278 | .cm_handle_connect = rds_iw_cm_handle_connect, | 276 | .cm_handle_connect = rds_iw_cm_handle_connect, |
| @@ -289,7 +287,7 @@ struct rds_transport rds_iw_transport = { | |||
| 289 | .t_prefer_loopback = 1, | 287 | .t_prefer_loopback = 1, |
| 290 | }; | 288 | }; |
| 291 | 289 | ||
| 292 | int __init rds_iw_init(void) | 290 | int rds_iw_init(void) |
| 293 | { | 291 | { |
| 294 | int ret; | 292 | int ret; |
| 295 | 293 | ||
diff --git a/net/rds/iw.h b/net/rds/iw.h index eef2f0c28476..543e665fafe3 100644 --- a/net/rds/iw.h +++ b/net/rds/iw.h | |||
| @@ -70,7 +70,7 @@ struct rds_iw_send_work { | |||
| 70 | struct rds_message *s_rm; | 70 | struct rds_message *s_rm; |
| 71 | 71 | ||
| 72 | /* We should really put these into a union: */ | 72 | /* We should really put these into a union: */ |
| 73 | struct rds_rdma_op *s_op; | 73 | struct rm_rdma_op *s_op; |
| 74 | struct rds_iw_mapping *s_mapping; | 74 | struct rds_iw_mapping *s_mapping; |
| 75 | struct ib_mr *s_mr; | 75 | struct ib_mr *s_mr; |
| 76 | struct ib_fast_reg_page_list *s_page_list; | 76 | struct ib_fast_reg_page_list *s_page_list; |
| @@ -284,7 +284,7 @@ void rds_iw_conn_free(void *arg); | |||
| 284 | int rds_iw_conn_connect(struct rds_connection *conn); | 284 | int rds_iw_conn_connect(struct rds_connection *conn); |
| 285 | void rds_iw_conn_shutdown(struct rds_connection *conn); | 285 | void rds_iw_conn_shutdown(struct rds_connection *conn); |
| 286 | void rds_iw_state_change(struct sock *sk); | 286 | void rds_iw_state_change(struct sock *sk); |
| 287 | int __init rds_iw_listen_init(void); | 287 | int rds_iw_listen_init(void); |
| 288 | void rds_iw_listen_stop(void); | 288 | void rds_iw_listen_stop(void); |
| 289 | void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); | 289 | void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); |
| 290 | int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, | 290 | int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, |
| @@ -321,12 +321,11 @@ void rds_iw_flush_mrs(void); | |||
| 321 | void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); | 321 | void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); |
| 322 | 322 | ||
| 323 | /* ib_recv.c */ | 323 | /* ib_recv.c */ |
| 324 | int __init rds_iw_recv_init(void); | 324 | int rds_iw_recv_init(void); |
| 325 | void rds_iw_recv_exit(void); | 325 | void rds_iw_recv_exit(void); |
| 326 | int rds_iw_recv(struct rds_connection *conn); | 326 | int rds_iw_recv(struct rds_connection *conn); |
| 327 | int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | 327 | int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, |
| 328 | gfp_t page_gfp, int prefill); | 328 | gfp_t page_gfp, int prefill); |
| 329 | void rds_iw_inc_purge(struct rds_incoming *inc); | ||
| 330 | void rds_iw_inc_free(struct rds_incoming *inc); | 329 | void rds_iw_inc_free(struct rds_incoming *inc); |
| 331 | int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, | 330 | int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, |
| 332 | size_t size); | 331 | size_t size); |
| @@ -358,7 +357,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
| 358 | void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); | 357 | void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); |
| 359 | void rds_iw_send_init_ring(struct rds_iw_connection *ic); | 358 | void rds_iw_send_init_ring(struct rds_iw_connection *ic); |
| 360 | void rds_iw_send_clear_ring(struct rds_iw_connection *ic); | 359 | void rds_iw_send_clear_ring(struct rds_iw_connection *ic); |
| 361 | int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); | 360 | int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); |
| 362 | void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); | 361 | void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); |
| 363 | void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); | 362 | void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); |
| 364 | int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, | 363 | int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, |
| @@ -371,7 +370,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, | |||
| 371 | unsigned int avail); | 370 | unsigned int avail); |
| 372 | 371 | ||
| 373 | /* ib_sysctl.c */ | 372 | /* ib_sysctl.c */ |
| 374 | int __init rds_iw_sysctl_init(void); | 373 | int rds_iw_sysctl_init(void); |
| 375 | void rds_iw_sysctl_exit(void); | 374 | void rds_iw_sysctl_exit(void); |
| 376 | extern unsigned long rds_iw_sysctl_max_send_wr; | 375 | extern unsigned long rds_iw_sysctl_max_send_wr; |
| 377 | extern unsigned long rds_iw_sysctl_max_recv_wr; | 376 | extern unsigned long rds_iw_sysctl_max_recv_wr; |
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index b5dd6ac39be8..712cf2d1f28e 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c | |||
| @@ -257,7 +257,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) | |||
| 257 | * the rds_iwdev at all. | 257 | * the rds_iwdev at all. |
| 258 | */ | 258 | */ |
| 259 | rds_iwdev = ib_get_client_data(dev, &rds_iw_client); | 259 | rds_iwdev = ib_get_client_data(dev, &rds_iw_client); |
| 260 | if (rds_iwdev == NULL) { | 260 | if (!rds_iwdev) { |
| 261 | if (printk_ratelimit()) | 261 | if (printk_ratelimit()) |
| 262 | printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", | 262 | printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", |
| 263 | dev->name); | 263 | dev->name); |
| @@ -292,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) | |||
| 292 | ic->i_send_ring.w_nr * | 292 | ic->i_send_ring.w_nr * |
| 293 | sizeof(struct rds_header), | 293 | sizeof(struct rds_header), |
| 294 | &ic->i_send_hdrs_dma, GFP_KERNEL); | 294 | &ic->i_send_hdrs_dma, GFP_KERNEL); |
| 295 | if (ic->i_send_hdrs == NULL) { | 295 | if (!ic->i_send_hdrs) { |
| 296 | ret = -ENOMEM; | 296 | ret = -ENOMEM; |
| 297 | rdsdebug("ib_dma_alloc_coherent send failed\n"); | 297 | rdsdebug("ib_dma_alloc_coherent send failed\n"); |
| 298 | goto out; | 298 | goto out; |
| @@ -302,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) | |||
| 302 | ic->i_recv_ring.w_nr * | 302 | ic->i_recv_ring.w_nr * |
| 303 | sizeof(struct rds_header), | 303 | sizeof(struct rds_header), |
| 304 | &ic->i_recv_hdrs_dma, GFP_KERNEL); | 304 | &ic->i_recv_hdrs_dma, GFP_KERNEL); |
| 305 | if (ic->i_recv_hdrs == NULL) { | 305 | if (!ic->i_recv_hdrs) { |
| 306 | ret = -ENOMEM; | 306 | ret = -ENOMEM; |
| 307 | rdsdebug("ib_dma_alloc_coherent recv failed\n"); | 307 | rdsdebug("ib_dma_alloc_coherent recv failed\n"); |
| 308 | goto out; | 308 | goto out; |
| @@ -310,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn) | |||
| 310 | 310 | ||
| 311 | ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), | 311 | ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), |
| 312 | &ic->i_ack_dma, GFP_KERNEL); | 312 | &ic->i_ack_dma, GFP_KERNEL); |
| 313 | if (ic->i_ack == NULL) { | 313 | if (!ic->i_ack) { |
| 314 | ret = -ENOMEM; | 314 | ret = -ENOMEM; |
| 315 | rdsdebug("ib_dma_alloc_coherent ack failed\n"); | 315 | rdsdebug("ib_dma_alloc_coherent ack failed\n"); |
| 316 | goto out; | 316 | goto out; |
| 317 | } | 317 | } |
| 318 | 318 | ||
| 319 | ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); | 319 | ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); |
| 320 | if (ic->i_sends == NULL) { | 320 | if (!ic->i_sends) { |
| 321 | ret = -ENOMEM; | 321 | ret = -ENOMEM; |
| 322 | rdsdebug("send allocation failed\n"); | 322 | rdsdebug("send allocation failed\n"); |
| 323 | goto out; | 323 | goto out; |
| @@ -325,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) | |||
| 325 | rds_iw_send_init_ring(ic); | 325 | rds_iw_send_init_ring(ic); |
| 326 | 326 | ||
| 327 | ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); | 327 | ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); |
| 328 | if (ic->i_recvs == NULL) { | 328 | if (!ic->i_recvs) { |
| 329 | ret = -ENOMEM; | 329 | ret = -ENOMEM; |
| 330 | rdsdebug("recv allocation failed\n"); | 330 | rdsdebug("recv allocation failed\n"); |
| 331 | goto out; | 331 | goto out; |
| @@ -696,7 +696,7 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) | |||
| 696 | 696 | ||
| 697 | /* XXX too lazy? */ | 697 | /* XXX too lazy? */ |
| 698 | ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); | 698 | ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); |
| 699 | if (ic == NULL) | 699 | if (!ic) |
| 700 | return -ENOMEM; | 700 | return -ENOMEM; |
| 701 | 701 | ||
| 702 | INIT_LIST_HEAD(&ic->iw_node); | 702 | INIT_LIST_HEAD(&ic->iw_node); |
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index 13dc1862d862..4e152e2daa3d 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c | |||
| @@ -34,7 +34,6 @@ | |||
| 34 | #include <linux/slab.h> | 34 | #include <linux/slab.h> |
| 35 | 35 | ||
| 36 | #include "rds.h" | 36 | #include "rds.h" |
| 37 | #include "rdma.h" | ||
| 38 | #include "iw.h" | 37 | #include "iw.h" |
| 39 | 38 | ||
| 40 | 39 | ||
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c index 3d479067d54d..5e57347f49ff 100644 --- a/net/rds/iw_recv.c +++ b/net/rds/iw_recv.c | |||
| @@ -53,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag) | |||
| 53 | static void rds_iw_frag_free(struct rds_page_frag *frag) | 53 | static void rds_iw_frag_free(struct rds_page_frag *frag) |
| 54 | { | 54 | { |
| 55 | rdsdebug("frag %p page %p\n", frag, frag->f_page); | 55 | rdsdebug("frag %p page %p\n", frag, frag->f_page); |
| 56 | BUG_ON(frag->f_page != NULL); | 56 | BUG_ON(frag->f_page); |
| 57 | kmem_cache_free(rds_iw_frag_slab, frag); | 57 | kmem_cache_free(rds_iw_frag_slab, frag); |
| 58 | } | 58 | } |
| 59 | 59 | ||
| @@ -143,14 +143,14 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn, | |||
| 143 | struct ib_sge *sge; | 143 | struct ib_sge *sge; |
| 144 | int ret = -ENOMEM; | 144 | int ret = -ENOMEM; |
| 145 | 145 | ||
| 146 | if (recv->r_iwinc == NULL) { | 146 | if (!recv->r_iwinc) { |
| 147 | if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { | 147 | if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { |
| 148 | rds_iw_stats_inc(s_iw_rx_alloc_limit); | 148 | rds_iw_stats_inc(s_iw_rx_alloc_limit); |
| 149 | goto out; | 149 | goto out; |
| 150 | } | 150 | } |
| 151 | recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, | 151 | recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, |
| 152 | kptr_gfp); | 152 | kptr_gfp); |
| 153 | if (recv->r_iwinc == NULL) { | 153 | if (!recv->r_iwinc) { |
| 154 | atomic_dec(&rds_iw_allocation); | 154 | atomic_dec(&rds_iw_allocation); |
| 155 | goto out; | 155 | goto out; |
| 156 | } | 156 | } |
| @@ -158,17 +158,17 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn, | |||
| 158 | rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); | 158 | rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); |
| 159 | } | 159 | } |
| 160 | 160 | ||
| 161 | if (recv->r_frag == NULL) { | 161 | if (!recv->r_frag) { |
| 162 | recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); | 162 | recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); |
| 163 | if (recv->r_frag == NULL) | 163 | if (!recv->r_frag) |
| 164 | goto out; | 164 | goto out; |
| 165 | INIT_LIST_HEAD(&recv->r_frag->f_item); | 165 | INIT_LIST_HEAD(&recv->r_frag->f_item); |
| 166 | recv->r_frag->f_page = NULL; | 166 | recv->r_frag->f_page = NULL; |
| 167 | } | 167 | } |
| 168 | 168 | ||
| 169 | if (ic->i_frag.f_page == NULL) { | 169 | if (!ic->i_frag.f_page) { |
| 170 | ic->i_frag.f_page = alloc_page(page_gfp); | 170 | ic->i_frag.f_page = alloc_page(page_gfp); |
| 171 | if (ic->i_frag.f_page == NULL) | 171 | if (!ic->i_frag.f_page) |
| 172 | goto out; | 172 | goto out; |
| 173 | ic->i_frag.f_offset = 0; | 173 | ic->i_frag.f_offset = 0; |
| 174 | } | 174 | } |
| @@ -273,7 +273,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | |||
| 273 | return ret; | 273 | return ret; |
| 274 | } | 274 | } |
| 275 | 275 | ||
| 276 | void rds_iw_inc_purge(struct rds_incoming *inc) | 276 | static void rds_iw_inc_purge(struct rds_incoming *inc) |
| 277 | { | 277 | { |
| 278 | struct rds_iw_incoming *iwinc; | 278 | struct rds_iw_incoming *iwinc; |
| 279 | struct rds_page_frag *frag; | 279 | struct rds_page_frag *frag; |
| @@ -716,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn, | |||
| 716 | * into the inc and save the inc so we can hang upcoming fragments | 716 | * into the inc and save the inc so we can hang upcoming fragments |
| 717 | * off its list. | 717 | * off its list. |
| 718 | */ | 718 | */ |
| 719 | if (iwinc == NULL) { | 719 | if (!iwinc) { |
| 720 | iwinc = recv->r_iwinc; | 720 | iwinc = recv->r_iwinc; |
| 721 | recv->r_iwinc = NULL; | 721 | recv->r_iwinc = NULL; |
| 722 | ic->i_iwinc = iwinc; | 722 | ic->i_iwinc = iwinc; |
| @@ -887,7 +887,7 @@ int rds_iw_recv(struct rds_connection *conn) | |||
| 887 | return ret; | 887 | return ret; |
| 888 | } | 888 | } |
| 889 | 889 | ||
| 890 | int __init rds_iw_recv_init(void) | 890 | int rds_iw_recv_init(void) |
| 891 | { | 891 | { |
| 892 | struct sysinfo si; | 892 | struct sysinfo si; |
| 893 | int ret = -ENOMEM; | 893 | int ret = -ENOMEM; |
| @@ -899,13 +899,13 @@ int __init rds_iw_recv_init(void) | |||
| 899 | rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", | 899 | rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", |
| 900 | sizeof(struct rds_iw_incoming), | 900 | sizeof(struct rds_iw_incoming), |
| 901 | 0, 0, NULL); | 901 | 0, 0, NULL); |
| 902 | if (rds_iw_incoming_slab == NULL) | 902 | if (!rds_iw_incoming_slab) |
| 903 | goto out; | 903 | goto out; |
| 904 | 904 | ||
| 905 | rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", | 905 | rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", |
| 906 | sizeof(struct rds_page_frag), | 906 | sizeof(struct rds_page_frag), |
| 907 | 0, 0, NULL); | 907 | 0, 0, NULL); |
| 908 | if (rds_iw_frag_slab == NULL) | 908 | if (!rds_iw_frag_slab) |
| 909 | kmem_cache_destroy(rds_iw_incoming_slab); | 909 | kmem_cache_destroy(rds_iw_incoming_slab); |
| 910 | else | 910 | else |
| 911 | ret = 0; | 911 | ret = 0; |
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 52182ff7519e..6280ea020d4e 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c | |||
| @@ -36,7 +36,6 @@ | |||
| 36 | #include <linux/dmapool.h> | 36 | #include <linux/dmapool.h> |
| 37 | 37 | ||
| 38 | #include "rds.h" | 38 | #include "rds.h" |
| 39 | #include "rdma.h" | ||
| 40 | #include "iw.h" | 39 | #include "iw.h" |
| 41 | 40 | ||
| 42 | static void rds_iw_send_rdma_complete(struct rds_message *rm, | 41 | static void rds_iw_send_rdma_complete(struct rds_message *rm, |
| @@ -64,13 +63,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm, | |||
| 64 | } | 63 | } |
| 65 | 64 | ||
| 66 | static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, | 65 | static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, |
| 67 | struct rds_rdma_op *op) | 66 | struct rm_rdma_op *op) |
| 68 | { | 67 | { |
| 69 | if (op->r_mapped) { | 68 | if (op->op_mapped) { |
| 70 | ib_dma_unmap_sg(ic->i_cm_id->device, | 69 | ib_dma_unmap_sg(ic->i_cm_id->device, |
| 71 | op->r_sg, op->r_nents, | 70 | op->op_sg, op->op_nents, |
| 72 | op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); | 71 | op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); |
| 73 | op->r_mapped = 0; | 72 | op->op_mapped = 0; |
| 74 | } | 73 | } |
| 75 | } | 74 | } |
| 76 | 75 | ||
| @@ -83,11 +82,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, | |||
| 83 | rdsdebug("ic %p send %p rm %p\n", ic, send, rm); | 82 | rdsdebug("ic %p send %p rm %p\n", ic, send, rm); |
| 84 | 83 | ||
| 85 | ib_dma_unmap_sg(ic->i_cm_id->device, | 84 | ib_dma_unmap_sg(ic->i_cm_id->device, |
| 86 | rm->m_sg, rm->m_nents, | 85 | rm->data.op_sg, rm->data.op_nents, |
| 87 | DMA_TO_DEVICE); | 86 | DMA_TO_DEVICE); |
| 88 | 87 | ||
| 89 | if (rm->m_rdma_op != NULL) { | 88 | if (rm->rdma.op_active) { |
| 90 | rds_iw_send_unmap_rdma(ic, rm->m_rdma_op); | 89 | rds_iw_send_unmap_rdma(ic, &rm->rdma); |
| 91 | 90 | ||
| 92 | /* If the user asked for a completion notification on this | 91 | /* If the user asked for a completion notification on this |
| 93 | * message, we can implement three different semantics: | 92 | * message, we can implement three different semantics: |
| @@ -111,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, | |||
| 111 | */ | 110 | */ |
| 112 | rds_iw_send_rdma_complete(rm, wc_status); | 111 | rds_iw_send_rdma_complete(rm, wc_status); |
| 113 | 112 | ||
| 114 | if (rm->m_rdma_op->r_write) | 113 | if (rm->rdma.op_write) |
| 115 | rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); | 114 | rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes); |
| 116 | else | 115 | else |
| 117 | rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); | 116 | rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes); |
| 118 | } | 117 | } |
| 119 | 118 | ||
| 120 | /* If anyone waited for this message to get flushed out, wake | 119 | /* If anyone waited for this message to get flushed out, wake |
| @@ -556,25 +555,27 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
| 556 | } | 555 | } |
| 557 | 556 | ||
| 558 | /* map the message the first time we see it */ | 557 | /* map the message the first time we see it */ |
| 559 | if (ic->i_rm == NULL) { | 558 | if (!ic->i_rm) { |
| 560 | /* | 559 | /* |
| 561 | printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", | 560 | printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", |
| 562 | be16_to_cpu(rm->m_inc.i_hdr.h_dport), | 561 | be16_to_cpu(rm->m_inc.i_hdr.h_dport), |
| 563 | rm->m_inc.i_hdr.h_flags, | 562 | rm->m_inc.i_hdr.h_flags, |
| 564 | be32_to_cpu(rm->m_inc.i_hdr.h_len)); | 563 | be32_to_cpu(rm->m_inc.i_hdr.h_len)); |
| 565 | */ | 564 | */ |
| 566 | if (rm->m_nents) { | 565 | if (rm->data.op_nents) { |
| 567 | rm->m_count = ib_dma_map_sg(dev, | 566 | rm->data.op_count = ib_dma_map_sg(dev, |
| 568 | rm->m_sg, rm->m_nents, DMA_TO_DEVICE); | 567 | rm->data.op_sg, |
| 569 | rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); | 568 | rm->data.op_nents, |
| 570 | if (rm->m_count == 0) { | 569 | DMA_TO_DEVICE); |
| 570 | rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); | ||
| 571 | if (rm->data.op_count == 0) { | ||
| 571 | rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); | 572 | rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); |
| 572 | rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); | 573 | rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); |
| 573 | ret = -ENOMEM; /* XXX ? */ | 574 | ret = -ENOMEM; /* XXX ? */ |
| 574 | goto out; | 575 | goto out; |
| 575 | } | 576 | } |
| 576 | } else { | 577 | } else { |
| 577 | rm->m_count = 0; | 578 | rm->data.op_count = 0; |
| 578 | } | 579 | } |
| 579 | 580 | ||
| 580 | ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; | 581 | ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; |
| @@ -590,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
| 590 | 591 | ||
| 591 | /* If it has a RDMA op, tell the peer we did it. This is | 592 | /* If it has a RDMA op, tell the peer we did it. This is |
| 592 | * used by the peer to release use-once RDMA MRs. */ | 593 | * used by the peer to release use-once RDMA MRs. */ |
| 593 | if (rm->m_rdma_op) { | 594 | if (rm->rdma.op_active) { |
| 594 | struct rds_ext_header_rdma ext_hdr; | 595 | struct rds_ext_header_rdma ext_hdr; |
| 595 | 596 | ||
| 596 | ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); | 597 | ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); |
| 597 | rds_message_add_extension(&rm->m_inc.i_hdr, | 598 | rds_message_add_extension(&rm->m_inc.i_hdr, |
| 598 | RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); | 599 | RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); |
| 599 | } | 600 | } |
| @@ -621,7 +622,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
| 621 | send = &ic->i_sends[pos]; | 622 | send = &ic->i_sends[pos]; |
| 622 | first = send; | 623 | first = send; |
| 623 | prev = NULL; | 624 | prev = NULL; |
| 624 | scat = &rm->m_sg[sg]; | 625 | scat = &rm->data.op_sg[sg]; |
| 625 | sent = 0; | 626 | sent = 0; |
| 626 | i = 0; | 627 | i = 0; |
| 627 | 628 | ||
| @@ -631,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
| 631 | * or when requested by the user. Right now, we let | 632 | * or when requested by the user. Right now, we let |
| 632 | * the application choose. | 633 | * the application choose. |
| 633 | */ | 634 | */ |
| 634 | if (rm->m_rdma_op && rm->m_rdma_op->r_fence) | 635 | if (rm->rdma.op_active && rm->rdma.op_fence) |
| 635 | send_flags = IB_SEND_FENCE; | 636 | send_flags = IB_SEND_FENCE; |
| 636 | 637 | ||
| 637 | /* | 638 | /* |
| @@ -650,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
| 650 | } | 651 | } |
| 651 | 652 | ||
| 652 | /* if there's data reference it with a chain of work reqs */ | 653 | /* if there's data reference it with a chain of work reqs */ |
| 653 | for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { | 654 | for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) { |
| 654 | unsigned int len; | 655 | unsigned int len; |
| 655 | 656 | ||
| 656 | send = &ic->i_sends[pos]; | 657 | send = &ic->i_sends[pos]; |
| @@ -728,7 +729,7 @@ add_header: | |||
| 728 | sent += sizeof(struct rds_header); | 729 | sent += sizeof(struct rds_header); |
| 729 | 730 | ||
| 730 | /* if we finished the message then send completion owns it */ | 731 | /* if we finished the message then send completion owns it */ |
| 731 | if (scat == &rm->m_sg[rm->m_count]) { | 732 | if (scat == &rm->data.op_sg[rm->data.op_count]) { |
| 732 | prev->s_rm = ic->i_rm; | 733 | prev->s_rm = ic->i_rm; |
| 733 | prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | 734 | prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; |
| 734 | ic->i_rm = NULL; | 735 | ic->i_rm = NULL; |
| @@ -784,7 +785,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd | |||
| 784 | ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); | 785 | ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); |
| 785 | } | 786 | } |
| 786 | 787 | ||
| 787 | int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | 788 | int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) |
| 788 | { | 789 | { |
| 789 | struct rds_iw_connection *ic = conn->c_transport_data; | 790 | struct rds_iw_connection *ic = conn->c_transport_data; |
| 790 | struct rds_iw_send_work *send = NULL; | 791 | struct rds_iw_send_work *send = NULL; |
| @@ -794,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
| 794 | struct rds_iw_device *rds_iwdev; | 795 | struct rds_iw_device *rds_iwdev; |
| 795 | struct scatterlist *scat; | 796 | struct scatterlist *scat; |
| 796 | unsigned long len; | 797 | unsigned long len; |
| 797 | u64 remote_addr = op->r_remote_addr; | 798 | u64 remote_addr = op->op_remote_addr; |
| 798 | u32 pos, fr_pos; | 799 | u32 pos, fr_pos; |
| 799 | u32 work_alloc; | 800 | u32 work_alloc; |
| 800 | u32 i; | 801 | u32 i; |
| @@ -806,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
| 806 | rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); | 807 | rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); |
| 807 | 808 | ||
| 808 | /* map the message the first time we see it */ | 809 | /* map the message the first time we see it */ |
| 809 | if (!op->r_mapped) { | 810 | if (!op->op_mapped) { |
| 810 | op->r_count = ib_dma_map_sg(ic->i_cm_id->device, | 811 | op->op_count = ib_dma_map_sg(ic->i_cm_id->device, |
| 811 | op->r_sg, op->r_nents, (op->r_write) ? | 812 | op->op_sg, op->op_nents, (op->op_write) ? |
| 812 | DMA_TO_DEVICE : DMA_FROM_DEVICE); | 813 | DMA_TO_DEVICE : DMA_FROM_DEVICE); |
| 813 | rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); | 814 | rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); |
| 814 | if (op->r_count == 0) { | 815 | if (op->op_count == 0) { |
| 815 | rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); | 816 | rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); |
| 816 | ret = -ENOMEM; /* XXX ? */ | 817 | ret = -ENOMEM; /* XXX ? */ |
| 817 | goto out; | 818 | goto out; |
| 818 | } | 819 | } |
| 819 | 820 | ||
| 820 | op->r_mapped = 1; | 821 | op->op_mapped = 1; |
| 821 | } | 822 | } |
| 822 | 823 | ||
| 823 | if (!op->r_write) { | 824 | if (!op->op_write) { |
| 824 | /* Alloc space on the send queue for the fastreg */ | 825 | /* Alloc space on the send queue for the fastreg */ |
| 825 | work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); | 826 | work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); |
| 826 | if (work_alloc != 1) { | 827 | if (work_alloc != 1) { |
| @@ -835,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
| 835 | * Instead of knowing how to return a partial rdma read/write we insist that there | 836 | * Instead of knowing how to return a partial rdma read/write we insist that there |
| 836 | * be enough work requests to send the entire message. | 837 | * be enough work requests to send the entire message. |
| 837 | */ | 838 | */ |
| 838 | i = ceil(op->r_count, rds_iwdev->max_sge); | 839 | i = ceil(op->op_count, rds_iwdev->max_sge); |
| 839 | 840 | ||
| 840 | work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); | 841 | work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); |
| 841 | if (work_alloc != i) { | 842 | if (work_alloc != i) { |
| @@ -846,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
| 846 | } | 847 | } |
| 847 | 848 | ||
| 848 | send = &ic->i_sends[pos]; | 849 | send = &ic->i_sends[pos]; |
| 849 | if (!op->r_write) { | 850 | if (!op->op_write) { |
| 850 | first = prev = &ic->i_sends[fr_pos]; | 851 | first = prev = &ic->i_sends[fr_pos]; |
| 851 | } else { | 852 | } else { |
| 852 | first = send; | 853 | first = send; |
| 853 | prev = NULL; | 854 | prev = NULL; |
| 854 | } | 855 | } |
| 855 | scat = &op->r_sg[0]; | 856 | scat = &op->op_sg[0]; |
| 856 | sent = 0; | 857 | sent = 0; |
| 857 | num_sge = op->r_count; | 858 | num_sge = op->op_count; |
| 858 | 859 | ||
| 859 | for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { | 860 | for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { |
| 860 | send->s_wr.send_flags = 0; | 861 | send->s_wr.send_flags = 0; |
| 861 | send->s_queued = jiffies; | 862 | send->s_queued = jiffies; |
| 862 | 863 | ||
| @@ -873,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
| 873 | * for local access after RDS is finished with it, using | 874 | * for local access after RDS is finished with it, using |
| 874 | * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. | 875 | * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. |
| 875 | */ | 876 | */ |
| 876 | if (op->r_write) | 877 | if (op->op_write) |
| 877 | send->s_wr.opcode = IB_WR_RDMA_WRITE; | 878 | send->s_wr.opcode = IB_WR_RDMA_WRITE; |
| 878 | else | 879 | else |
| 879 | send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; | 880 | send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; |
| 880 | 881 | ||
| 881 | send->s_wr.wr.rdma.remote_addr = remote_addr; | 882 | send->s_wr.wr.rdma.remote_addr = remote_addr; |
| 882 | send->s_wr.wr.rdma.rkey = op->r_key; | 883 | send->s_wr.wr.rdma.rkey = op->op_rkey; |
| 883 | send->s_op = op; | 884 | send->s_op = op; |
| 884 | 885 | ||
| 885 | if (num_sge > rds_iwdev->max_sge) { | 886 | if (num_sge > rds_iwdev->max_sge) { |
| @@ -893,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
| 893 | if (prev) | 894 | if (prev) |
| 894 | prev->s_wr.next = &send->s_wr; | 895 | prev->s_wr.next = &send->s_wr; |
| 895 | 896 | ||
| 896 | for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { | 897 | for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { |
| 897 | len = ib_sg_dma_len(ic->i_cm_id->device, scat); | 898 | len = ib_sg_dma_len(ic->i_cm_id->device, scat); |
| 898 | 899 | ||
| 899 | if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) | 900 | if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) |
| @@ -927,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
| 927 | } | 928 | } |
| 928 | 929 | ||
| 929 | /* if we finished the message then send completion owns it */ | 930 | /* if we finished the message then send completion owns it */ |
| 930 | if (scat == &op->r_sg[op->r_count]) | 931 | if (scat == &op->op_sg[op->op_count]) |
| 931 | first->s_wr.send_flags = IB_SEND_SIGNALED; | 932 | first->s_wr.send_flags = IB_SEND_SIGNALED; |
| 932 | 933 | ||
| 933 | if (i < work_alloc) { | 934 | if (i < work_alloc) { |
| @@ -941,9 +942,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
| 941 | * adapters do not allow using the lkey for this at all. To bypass this use a | 942 | * adapters do not allow using the lkey for this at all. To bypass this use a |
| 942 | * fastreg_mr (or possibly a dma_mr) | 943 | * fastreg_mr (or possibly a dma_mr) |
| 943 | */ | 944 | */ |
| 944 | if (!op->r_write) { | 945 | if (!op->op_write) { |
| 945 | rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], | 946 | rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], |
| 946 | op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); | 947 | op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); |
| 947 | work_alloc++; | 948 | work_alloc++; |
| 948 | } | 949 | } |
| 949 | 950 | ||
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c index 1c4428a61a02..23e3a9a26aaf 100644 --- a/net/rds/iw_sysctl.c +++ b/net/rds/iw_sysctl.c | |||
| @@ -122,10 +122,10 @@ void rds_iw_sysctl_exit(void) | |||
| 122 | unregister_sysctl_table(rds_iw_sysctl_hdr); | 122 | unregister_sysctl_table(rds_iw_sysctl_hdr); |
| 123 | } | 123 | } |
| 124 | 124 | ||
| 125 | int __init rds_iw_sysctl_init(void) | 125 | int rds_iw_sysctl_init(void) |
| 126 | { | 126 | { |
| 127 | rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); | 127 | rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); |
| 128 | if (rds_iw_sysctl_hdr == NULL) | 128 | if (!rds_iw_sysctl_hdr) |
| 129 | return -ENOMEM; | 129 | return -ENOMEM; |
| 130 | return 0; | 130 | return 0; |
| 131 | } | 131 | } |
diff --git a/net/rds/loop.c b/net/rds/loop.c index dd9879379457..c390156b426f 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c | |||
| @@ -61,10 +61,17 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
| 61 | unsigned int hdr_off, unsigned int sg, | 61 | unsigned int hdr_off, unsigned int sg, |
| 62 | unsigned int off) | 62 | unsigned int off) |
| 63 | { | 63 | { |
| 64 | /* Do not send cong updates to loopback */ | ||
| 65 | if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { | ||
| 66 | rds_cong_map_updated(conn->c_fcong, ~(u64) 0); | ||
| 67 | return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; | ||
| 68 | } | ||
| 69 | |||
| 64 | BUG_ON(hdr_off || sg || off); | 70 | BUG_ON(hdr_off || sg || off); |
| 65 | 71 | ||
| 66 | rds_inc_init(&rm->m_inc, conn, conn->c_laddr); | 72 | rds_inc_init(&rm->m_inc, conn, conn->c_laddr); |
| 67 | rds_message_addref(rm); /* for the inc */ | 73 | /* For the embedded inc. Matching put is in loop_inc_free() */ |
| 74 | rds_message_addref(rm); | ||
| 68 | 75 | ||
| 69 | rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, | 76 | rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, |
| 70 | GFP_KERNEL, KM_USER0); | 77 | GFP_KERNEL, KM_USER0); |
| @@ -77,16 +84,14 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
| 77 | return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); | 84 | return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); |
| 78 | } | 85 | } |
| 79 | 86 | ||
| 80 | static int rds_loop_xmit_cong_map(struct rds_connection *conn, | 87 | /* |
| 81 | struct rds_cong_map *map, | 88 | * See rds_loop_xmit(). Since our inc is embedded in the rm, we |
| 82 | unsigned long offset) | 89 | * make sure the rm lives at least until the inc is done. |
| 90 | */ | ||
| 91 | static void rds_loop_inc_free(struct rds_incoming *inc) | ||
| 83 | { | 92 | { |
| 84 | BUG_ON(offset); | 93 | struct rds_message *rm = container_of(inc, struct rds_message, m_inc); |
| 85 | BUG_ON(map != conn->c_lcong); | 94 | rds_message_put(rm); |
| 86 | |||
| 87 | rds_cong_map_updated(conn->c_fcong, ~(u64) 0); | ||
| 88 | |||
| 89 | return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; | ||
| 90 | } | 95 | } |
| 91 | 96 | ||
| 92 | /* we need to at least give the thread something to succeed */ | 97 | /* we need to at least give the thread something to succeed */ |
| @@ -112,7 +117,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) | |||
| 112 | unsigned long flags; | 117 | unsigned long flags; |
| 113 | 118 | ||
| 114 | lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); | 119 | lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); |
| 115 | if (lc == NULL) | 120 | if (!lc) |
| 116 | return -ENOMEM; | 121 | return -ENOMEM; |
| 117 | 122 | ||
| 118 | INIT_LIST_HEAD(&lc->loop_node); | 123 | INIT_LIST_HEAD(&lc->loop_node); |
| @@ -169,14 +174,12 @@ void rds_loop_exit(void) | |||
| 169 | */ | 174 | */ |
| 170 | struct rds_transport rds_loop_transport = { | 175 | struct rds_transport rds_loop_transport = { |
| 171 | .xmit = rds_loop_xmit, | 176 | .xmit = rds_loop_xmit, |
| 172 | .xmit_cong_map = rds_loop_xmit_cong_map, | ||
| 173 | .recv = rds_loop_recv, | 177 | .recv = rds_loop_recv, |
| 174 | .conn_alloc = rds_loop_conn_alloc, | 178 | .conn_alloc = rds_loop_conn_alloc, |
| 175 | .conn_free = rds_loop_conn_free, | 179 | .conn_free = rds_loop_conn_free, |
| 176 | .conn_connect = rds_loop_conn_connect, | 180 | .conn_connect = rds_loop_conn_connect, |
| 177 | .conn_shutdown = rds_loop_conn_shutdown, | 181 | .conn_shutdown = rds_loop_conn_shutdown, |
| 178 | .inc_copy_to_user = rds_message_inc_copy_to_user, | 182 | .inc_copy_to_user = rds_message_inc_copy_to_user, |
| 179 | .inc_purge = rds_message_inc_purge, | 183 | .inc_free = rds_loop_inc_free, |
| 180 | .inc_free = rds_message_inc_free, | ||
| 181 | .t_name = "loopback", | 184 | .t_name = "loopback", |
| 182 | }; | 185 | }; |
diff --git a/net/rds/message.c b/net/rds/message.c index 9a1d67e001ba..84f937f11d47 100644 --- a/net/rds/message.c +++ b/net/rds/message.c | |||
| @@ -34,9 +34,6 @@ | |||
| 34 | #include <linux/slab.h> | 34 | #include <linux/slab.h> |
| 35 | 35 | ||
| 36 | #include "rds.h" | 36 | #include "rds.h" |
| 37 | #include "rdma.h" | ||
| 38 | |||
| 39 | static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq); | ||
| 40 | 37 | ||
| 41 | static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { | 38 | static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { |
| 42 | [RDS_EXTHDR_NONE] = 0, | 39 | [RDS_EXTHDR_NONE] = 0, |
| @@ -63,29 +60,31 @@ static void rds_message_purge(struct rds_message *rm) | |||
| 63 | if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) | 60 | if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) |
| 64 | return; | 61 | return; |
| 65 | 62 | ||
| 66 | for (i = 0; i < rm->m_nents; i++) { | 63 | for (i = 0; i < rm->data.op_nents; i++) { |
| 67 | rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i])); | 64 | rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i])); |
| 68 | /* XXX will have to put_page for page refs */ | 65 | /* XXX will have to put_page for page refs */ |
| 69 | __free_page(sg_page(&rm->m_sg[i])); | 66 | __free_page(sg_page(&rm->data.op_sg[i])); |
| 70 | } | 67 | } |
| 71 | rm->m_nents = 0; | 68 | rm->data.op_nents = 0; |
| 72 | 69 | ||
| 73 | if (rm->m_rdma_op) | 70 | if (rm->rdma.op_active) |
| 74 | rds_rdma_free_op(rm->m_rdma_op); | 71 | rds_rdma_free_op(&rm->rdma); |
| 75 | if (rm->m_rdma_mr) | 72 | if (rm->rdma.op_rdma_mr) |
| 76 | rds_mr_put(rm->m_rdma_mr); | 73 | rds_mr_put(rm->rdma.op_rdma_mr); |
| 77 | } | ||
| 78 | 74 | ||
| 79 | void rds_message_inc_purge(struct rds_incoming *inc) | 75 | if (rm->atomic.op_active) |
| 80 | { | 76 | rds_atomic_free_op(&rm->atomic); |
| 81 | struct rds_message *rm = container_of(inc, struct rds_message, m_inc); | 77 | if (rm->atomic.op_rdma_mr) |
| 82 | rds_message_purge(rm); | 78 | rds_mr_put(rm->atomic.op_rdma_mr); |
| 83 | } | 79 | } |
| 84 | 80 | ||
| 85 | void rds_message_put(struct rds_message *rm) | 81 | void rds_message_put(struct rds_message *rm) |
| 86 | { | 82 | { |
| 87 | rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); | 83 | rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); |
| 88 | 84 | if (atomic_read(&rm->m_refcount) == 0) { | |
| 85 | printk(KERN_CRIT "danger refcount zero on %p\n", rm); | ||
| 86 | WARN_ON(1); | ||
| 87 | } | ||
| 89 | if (atomic_dec_and_test(&rm->m_refcount)) { | 88 | if (atomic_dec_and_test(&rm->m_refcount)) { |
| 90 | BUG_ON(!list_empty(&rm->m_sock_item)); | 89 | BUG_ON(!list_empty(&rm->m_sock_item)); |
| 91 | BUG_ON(!list_empty(&rm->m_conn_item)); | 90 | BUG_ON(!list_empty(&rm->m_conn_item)); |
| @@ -96,12 +95,6 @@ void rds_message_put(struct rds_message *rm) | |||
| 96 | } | 95 | } |
| 97 | EXPORT_SYMBOL_GPL(rds_message_put); | 96 | EXPORT_SYMBOL_GPL(rds_message_put); |
| 98 | 97 | ||
| 99 | void rds_message_inc_free(struct rds_incoming *inc) | ||
| 100 | { | ||
| 101 | struct rds_message *rm = container_of(inc, struct rds_message, m_inc); | ||
| 102 | rds_message_put(rm); | ||
| 103 | } | ||
| 104 | |||
| 105 | void rds_message_populate_header(struct rds_header *hdr, __be16 sport, | 98 | void rds_message_populate_header(struct rds_header *hdr, __be16 sport, |
| 106 | __be16 dport, u64 seq) | 99 | __be16 dport, u64 seq) |
| 107 | { | 100 | { |
| @@ -214,41 +207,68 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o | |||
| 214 | } | 207 | } |
| 215 | EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); | 208 | EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); |
| 216 | 209 | ||
| 217 | struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp) | 210 | /* |
| 211 | * Each rds_message is allocated with extra space for the scatterlist entries | ||
| 212 | * rds ops will need. This is to minimize memory allocation count. Then, each rds op | ||
| 213 | * can grab SGs when initializing its part of the rds_message. | ||
| 214 | */ | ||
| 215 | struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp) | ||
| 218 | { | 216 | { |
| 219 | struct rds_message *rm; | 217 | struct rds_message *rm; |
| 220 | 218 | ||
| 221 | rm = kzalloc(sizeof(struct rds_message) + | 219 | rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp); |
| 222 | (nents * sizeof(struct scatterlist)), gfp); | ||
| 223 | if (!rm) | 220 | if (!rm) |
| 224 | goto out; | 221 | goto out; |
| 225 | 222 | ||
| 226 | if (nents) | 223 | rm->m_used_sgs = 0; |
| 227 | sg_init_table(rm->m_sg, nents); | 224 | rm->m_total_sgs = extra_len / sizeof(struct scatterlist); |
| 225 | |||
| 228 | atomic_set(&rm->m_refcount, 1); | 226 | atomic_set(&rm->m_refcount, 1); |
| 229 | INIT_LIST_HEAD(&rm->m_sock_item); | 227 | INIT_LIST_HEAD(&rm->m_sock_item); |
| 230 | INIT_LIST_HEAD(&rm->m_conn_item); | 228 | INIT_LIST_HEAD(&rm->m_conn_item); |
| 231 | spin_lock_init(&rm->m_rs_lock); | 229 | spin_lock_init(&rm->m_rs_lock); |
| 230 | init_waitqueue_head(&rm->m_flush_wait); | ||
| 232 | 231 | ||
| 233 | out: | 232 | out: |
| 234 | return rm; | 233 | return rm; |
| 235 | } | 234 | } |
| 236 | 235 | ||
| 236 | /* | ||
| 237 | * RDS ops use this to grab SG entries from the rm's sg pool. | ||
| 238 | */ | ||
| 239 | struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) | ||
| 240 | { | ||
| 241 | struct scatterlist *sg_first = (struct scatterlist *) &rm[1]; | ||
| 242 | struct scatterlist *sg_ret; | ||
| 243 | |||
| 244 | WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); | ||
| 245 | WARN_ON(!nents); | ||
| 246 | |||
| 247 | sg_ret = &sg_first[rm->m_used_sgs]; | ||
| 248 | sg_init_table(sg_ret, nents); | ||
| 249 | rm->m_used_sgs += nents; | ||
| 250 | |||
| 251 | return sg_ret; | ||
| 252 | } | ||
| 253 | |||
| 237 | struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) | 254 | struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) |
| 238 | { | 255 | { |
| 239 | struct rds_message *rm; | 256 | struct rds_message *rm; |
| 240 | unsigned int i; | 257 | unsigned int i; |
| 258 | int num_sgs = ceil(total_len, PAGE_SIZE); | ||
| 259 | int extra_bytes = num_sgs * sizeof(struct scatterlist); | ||
| 241 | 260 | ||
| 242 | rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); | 261 | rm = rds_message_alloc(extra_bytes, GFP_NOWAIT); |
| 243 | if (rm == NULL) | 262 | if (!rm) |
| 244 | return ERR_PTR(-ENOMEM); | 263 | return ERR_PTR(-ENOMEM); |
| 245 | 264 | ||
| 246 | set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); | 265 | set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); |
| 247 | rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); | 266 | rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); |
| 248 | rm->m_nents = ceil(total_len, PAGE_SIZE); | 267 | rm->data.op_nents = ceil(total_len, PAGE_SIZE); |
| 268 | rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); | ||
| 249 | 269 | ||
| 250 | for (i = 0; i < rm->m_nents; ++i) { | 270 | for (i = 0; i < rm->data.op_nents; ++i) { |
| 251 | sg_set_page(&rm->m_sg[i], | 271 | sg_set_page(&rm->data.op_sg[i], |
| 252 | virt_to_page(page_addrs[i]), | 272 | virt_to_page(page_addrs[i]), |
| 253 | PAGE_SIZE, 0); | 273 | PAGE_SIZE, 0); |
| 254 | } | 274 | } |
| @@ -256,40 +276,33 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in | |||
| 256 | return rm; | 276 | return rm; |
| 257 | } | 277 | } |
| 258 | 278 | ||
| 259 | struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, | 279 | int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, |
| 260 | size_t total_len) | 280 | size_t total_len) |
| 261 | { | 281 | { |
| 262 | unsigned long to_copy; | 282 | unsigned long to_copy; |
| 263 | unsigned long iov_off; | 283 | unsigned long iov_off; |
| 264 | unsigned long sg_off; | 284 | unsigned long sg_off; |
| 265 | struct rds_message *rm; | ||
| 266 | struct iovec *iov; | 285 | struct iovec *iov; |
| 267 | struct scatterlist *sg; | 286 | struct scatterlist *sg; |
| 268 | int ret; | 287 | int ret = 0; |
| 269 | |||
| 270 | rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); | ||
| 271 | if (rm == NULL) { | ||
| 272 | ret = -ENOMEM; | ||
| 273 | goto out; | ||
| 274 | } | ||
| 275 | 288 | ||
| 276 | rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); | 289 | rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); |
| 277 | 290 | ||
| 278 | /* | 291 | /* |
| 279 | * now allocate and copy in the data payload. | 292 | * now allocate and copy in the data payload. |
| 280 | */ | 293 | */ |
| 281 | sg = rm->m_sg; | 294 | sg = rm->data.op_sg; |
| 282 | iov = first_iov; | 295 | iov = first_iov; |
| 283 | iov_off = 0; | 296 | iov_off = 0; |
| 284 | sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ | 297 | sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ |
| 285 | 298 | ||
| 286 | while (total_len) { | 299 | while (total_len) { |
| 287 | if (sg_page(sg) == NULL) { | 300 | if (!sg_page(sg)) { |
| 288 | ret = rds_page_remainder_alloc(sg, total_len, | 301 | ret = rds_page_remainder_alloc(sg, total_len, |
| 289 | GFP_HIGHUSER); | 302 | GFP_HIGHUSER); |
| 290 | if (ret) | 303 | if (ret) |
| 291 | goto out; | 304 | goto out; |
| 292 | rm->m_nents++; | 305 | rm->data.op_nents++; |
| 293 | sg_off = 0; | 306 | sg_off = 0; |
| 294 | } | 307 | } |
| 295 | 308 | ||
| @@ -320,14 +333,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, | |||
| 320 | sg++; | 333 | sg++; |
| 321 | } | 334 | } |
| 322 | 335 | ||
| 323 | ret = 0; | ||
| 324 | out: | 336 | out: |
| 325 | if (ret) { | 337 | return ret; |
| 326 | if (rm) | ||
| 327 | rds_message_put(rm); | ||
| 328 | rm = ERR_PTR(ret); | ||
| 329 | } | ||
| 330 | return rm; | ||
| 331 | } | 338 | } |
| 332 | 339 | ||
| 333 | int rds_message_inc_copy_to_user(struct rds_incoming *inc, | 340 | int rds_message_inc_copy_to_user(struct rds_incoming *inc, |
| @@ -348,7 +355,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, | |||
| 348 | 355 | ||
| 349 | iov = first_iov; | 356 | iov = first_iov; |
| 350 | iov_off = 0; | 357 | iov_off = 0; |
| 351 | sg = rm->m_sg; | 358 | sg = rm->data.op_sg; |
| 352 | vec_off = 0; | 359 | vec_off = 0; |
| 353 | copied = 0; | 360 | copied = 0; |
| 354 | 361 | ||
| @@ -394,15 +401,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, | |||
| 394 | */ | 401 | */ |
| 395 | void rds_message_wait(struct rds_message *rm) | 402 | void rds_message_wait(struct rds_message *rm) |
| 396 | { | 403 | { |
| 397 | wait_event(rds_message_flush_waitq, | 404 | wait_event_interruptible(rm->m_flush_wait, |
| 398 | !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); | 405 | !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); |
| 399 | } | 406 | } |
| 400 | 407 | ||
| 401 | void rds_message_unmapped(struct rds_message *rm) | 408 | void rds_message_unmapped(struct rds_message *rm) |
| 402 | { | 409 | { |
| 403 | clear_bit(RDS_MSG_MAPPED, &rm->m_flags); | 410 | clear_bit(RDS_MSG_MAPPED, &rm->m_flags); |
| 404 | if (waitqueue_active(&rds_message_flush_waitq)) | 411 | wake_up_interruptible(&rm->m_flush_wait); |
| 405 | wake_up(&rds_message_flush_waitq); | ||
| 406 | } | 412 | } |
| 407 | EXPORT_SYMBOL_GPL(rds_message_unmapped); | 413 | EXPORT_SYMBOL_GPL(rds_message_unmapped); |
| 408 | 414 | ||
diff --git a/net/rds/page.c b/net/rds/page.c index 595a952d4b17..5e44f5ae7898 100644 --- a/net/rds/page.c +++ b/net/rds/page.c | |||
| @@ -116,7 +116,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, | |||
| 116 | /* jump straight to allocation if we're trying for a huge page */ | 116 | /* jump straight to allocation if we're trying for a huge page */ |
| 117 | if (bytes >= PAGE_SIZE) { | 117 | if (bytes >= PAGE_SIZE) { |
| 118 | page = alloc_page(gfp); | 118 | page = alloc_page(gfp); |
| 119 | if (page == NULL) { | 119 | if (!page) { |
| 120 | ret = -ENOMEM; | 120 | ret = -ENOMEM; |
| 121 | } else { | 121 | } else { |
| 122 | sg_set_page(scat, page, PAGE_SIZE, 0); | 122 | sg_set_page(scat, page, PAGE_SIZE, 0); |
| @@ -162,7 +162,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, | |||
| 162 | rem = &per_cpu(rds_page_remainders, get_cpu()); | 162 | rem = &per_cpu(rds_page_remainders, get_cpu()); |
| 163 | local_irq_save(flags); | 163 | local_irq_save(flags); |
| 164 | 164 | ||
| 165 | if (page == NULL) { | 165 | if (!page) { |
| 166 | ret = -ENOMEM; | 166 | ret = -ENOMEM; |
| 167 | break; | 167 | break; |
| 168 | } | 168 | } |
| @@ -186,6 +186,7 @@ out: | |||
| 186 | ret ? 0 : scat->length); | 186 | ret ? 0 : scat->length); |
| 187 | return ret; | 187 | return ret; |
| 188 | } | 188 | } |
| 189 | EXPORT_SYMBOL_GPL(rds_page_remainder_alloc); | ||
| 189 | 190 | ||
| 190 | static int rds_page_remainder_cpu_notify(struct notifier_block *self, | 191 | static int rds_page_remainder_cpu_notify(struct notifier_block *self, |
| 191 | unsigned long action, void *hcpu) | 192 | unsigned long action, void *hcpu) |
diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 75fd13bb631b..48064673fc76 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c | |||
| @@ -35,7 +35,7 @@ | |||
| 35 | #include <linux/rbtree.h> | 35 | #include <linux/rbtree.h> |
| 36 | #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ | 36 | #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ |
| 37 | 37 | ||
| 38 | #include "rdma.h" | 38 | #include "rds.h" |
| 39 | 39 | ||
| 40 | /* | 40 | /* |
| 41 | * XXX | 41 | * XXX |
| @@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs) | |||
| 130 | { | 130 | { |
| 131 | struct rds_mr *mr; | 131 | struct rds_mr *mr; |
| 132 | struct rb_node *node; | 132 | struct rb_node *node; |
| 133 | unsigned long flags; | ||
| 133 | 134 | ||
| 134 | /* Release any MRs associated with this socket */ | 135 | /* Release any MRs associated with this socket */ |
| 136 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); | ||
| 135 | while ((node = rb_first(&rs->rs_rdma_keys))) { | 137 | while ((node = rb_first(&rs->rs_rdma_keys))) { |
| 136 | mr = container_of(node, struct rds_mr, r_rb_node); | 138 | mr = container_of(node, struct rds_mr, r_rb_node); |
| 137 | if (mr->r_trans == rs->rs_transport) | 139 | if (mr->r_trans == rs->rs_transport) |
| 138 | mr->r_invalidate = 0; | 140 | mr->r_invalidate = 0; |
| 141 | rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); | ||
| 142 | RB_CLEAR_NODE(&mr->r_rb_node); | ||
| 143 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | ||
| 144 | rds_destroy_mr(mr); | ||
| 139 | rds_mr_put(mr); | 145 | rds_mr_put(mr); |
| 146 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); | ||
| 140 | } | 147 | } |
| 148 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | ||
| 141 | 149 | ||
| 142 | if (rs->rs_transport && rs->rs_transport->flush_mrs) | 150 | if (rs->rs_transport && rs->rs_transport->flush_mrs) |
| 143 | rs->rs_transport->flush_mrs(); | 151 | rs->rs_transport->flush_mrs(); |
| @@ -181,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, | |||
| 181 | goto out; | 189 | goto out; |
| 182 | } | 190 | } |
| 183 | 191 | ||
| 184 | if (rs->rs_transport->get_mr == NULL) { | 192 | if (!rs->rs_transport->get_mr) { |
| 185 | ret = -EOPNOTSUPP; | 193 | ret = -EOPNOTSUPP; |
| 186 | goto out; | 194 | goto out; |
| 187 | } | 195 | } |
| @@ -197,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, | |||
| 197 | 205 | ||
| 198 | /* XXX clamp nr_pages to limit the size of this alloc? */ | 206 | /* XXX clamp nr_pages to limit the size of this alloc? */ |
| 199 | pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); | 207 | pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); |
| 200 | if (pages == NULL) { | 208 | if (!pages) { |
| 201 | ret = -ENOMEM; | 209 | ret = -ENOMEM; |
| 202 | goto out; | 210 | goto out; |
| 203 | } | 211 | } |
| 204 | 212 | ||
| 205 | mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); | 213 | mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); |
| 206 | if (mr == NULL) { | 214 | if (!mr) { |
| 207 | ret = -ENOMEM; | 215 | ret = -ENOMEM; |
| 208 | goto out; | 216 | goto out; |
| 209 | } | 217 | } |
| @@ -230,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, | |||
| 230 | * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to | 238 | * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to |
| 231 | * the zero page. | 239 | * the zero page. |
| 232 | */ | 240 | */ |
| 233 | ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1); | 241 | ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); |
| 234 | if (ret < 0) | 242 | if (ret < 0) |
| 235 | goto out; | 243 | goto out; |
| 236 | 244 | ||
| 237 | nents = ret; | 245 | nents = ret; |
| 238 | sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); | 246 | sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); |
| 239 | if (sg == NULL) { | 247 | if (!sg) { |
| 240 | ret = -ENOMEM; | 248 | ret = -ENOMEM; |
| 241 | goto out; | 249 | goto out; |
| 242 | } | 250 | } |
| @@ -406,68 +414,127 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) | |||
| 406 | 414 | ||
| 407 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); | 415 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); |
| 408 | mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); | 416 | mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); |
| 409 | if (mr && (mr->r_use_once || force)) { | 417 | if (!mr) { |
| 418 | printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key); | ||
| 419 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | ||
| 420 | return; | ||
| 421 | } | ||
| 422 | |||
| 423 | if (mr->r_use_once || force) { | ||
| 410 | rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); | 424 | rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); |
| 411 | RB_CLEAR_NODE(&mr->r_rb_node); | 425 | RB_CLEAR_NODE(&mr->r_rb_node); |
| 412 | zot_me = 1; | 426 | zot_me = 1; |
| 413 | } else if (mr) | 427 | } |
| 414 | atomic_inc(&mr->r_refcount); | ||
| 415 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | 428 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); |
| 416 | 429 | ||
| 417 | /* May have to issue a dma_sync on this memory region. | 430 | /* May have to issue a dma_sync on this memory region. |
| 418 | * Note we could avoid this if the operation was a RDMA READ, | 431 | * Note we could avoid this if the operation was a RDMA READ, |
| 419 | * but at this point we can't tell. */ | 432 | * but at this point we can't tell. */ |
| 420 | if (mr != NULL) { | 433 | if (mr->r_trans->sync_mr) |
| 421 | if (mr->r_trans->sync_mr) | 434 | mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); |
| 422 | mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); | 435 | |
| 423 | 436 | /* If the MR was marked as invalidate, this will | |
| 424 | /* If the MR was marked as invalidate, this will | 437 | * trigger an async flush. */ |
| 425 | * trigger an async flush. */ | 438 | if (zot_me) |
| 426 | if (zot_me) | 439 | rds_destroy_mr(mr); |
| 427 | rds_destroy_mr(mr); | 440 | rds_mr_put(mr); |
| 428 | rds_mr_put(mr); | ||
| 429 | } | ||
| 430 | } | 441 | } |
| 431 | 442 | ||
| 432 | void rds_rdma_free_op(struct rds_rdma_op *ro) | 443 | void rds_rdma_free_op(struct rm_rdma_op *ro) |
| 433 | { | 444 | { |
| 434 | unsigned int i; | 445 | unsigned int i; |
| 435 | 446 | ||
| 436 | for (i = 0; i < ro->r_nents; i++) { | 447 | for (i = 0; i < ro->op_nents; i++) { |
| 437 | struct page *page = sg_page(&ro->r_sg[i]); | 448 | struct page *page = sg_page(&ro->op_sg[i]); |
| 438 | 449 | ||
| 439 | /* Mark page dirty if it was possibly modified, which | 450 | /* Mark page dirty if it was possibly modified, which |
| 440 | * is the case for a RDMA_READ which copies from remote | 451 | * is the case for a RDMA_READ which copies from remote |
| 441 | * to local memory */ | 452 | * to local memory */ |
| 442 | if (!ro->r_write) { | 453 | if (!ro->op_write) { |
| 443 | BUG_ON(in_interrupt()); | 454 | BUG_ON(irqs_disabled()); |
| 444 | set_page_dirty(page); | 455 | set_page_dirty(page); |
| 445 | } | 456 | } |
| 446 | put_page(page); | 457 | put_page(page); |
| 447 | } | 458 | } |
| 448 | 459 | ||
| 449 | kfree(ro->r_notifier); | 460 | kfree(ro->op_notifier); |
| 450 | kfree(ro); | 461 | ro->op_notifier = NULL; |
| 462 | ro->op_active = 0; | ||
| 463 | } | ||
| 464 | |||
| 465 | void rds_atomic_free_op(struct rm_atomic_op *ao) | ||
| 466 | { | ||
| 467 | struct page *page = sg_page(ao->op_sg); | ||
| 468 | |||
| 469 | /* Mark page dirty if it was possibly modified, which | ||
| 470 | * is the case for a RDMA_READ which copies from remote | ||
| 471 | * to local memory */ | ||
| 472 | set_page_dirty(page); | ||
| 473 | put_page(page); | ||
| 474 | |||
| 475 | kfree(ao->op_notifier); | ||
| 476 | ao->op_notifier = NULL; | ||
| 477 | ao->op_active = 0; | ||
| 478 | } | ||
| 479 | |||
| 480 | |||
| 481 | /* | ||
| 482 | * Count the number of pages needed to describe an incoming iovec. | ||
| 483 | */ | ||
| 484 | static int rds_rdma_pages(struct rds_rdma_args *args) | ||
| 485 | { | ||
| 486 | struct rds_iovec vec; | ||
| 487 | struct rds_iovec __user *local_vec; | ||
| 488 | unsigned int tot_pages = 0; | ||
| 489 | unsigned int nr_pages; | ||
| 490 | unsigned int i; | ||
| 491 | |||
| 492 | local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; | ||
| 493 | |||
| 494 | /* figure out the number of pages in the vector */ | ||
| 495 | for (i = 0; i < args->nr_local; i++) { | ||
| 496 | if (copy_from_user(&vec, &local_vec[i], | ||
| 497 | sizeof(struct rds_iovec))) | ||
| 498 | return -EFAULT; | ||
| 499 | |||
| 500 | nr_pages = rds_pages_in_vec(&vec); | ||
| 501 | if (nr_pages == 0) | ||
| 502 | return -EINVAL; | ||
| 503 | |||
| 504 | tot_pages += nr_pages; | ||
| 505 | } | ||
| 506 | |||
| 507 | return tot_pages; | ||
| 508 | } | ||
| 509 | |||
| 510 | int rds_rdma_extra_size(struct rds_rdma_args *args) | ||
| 511 | { | ||
| 512 | return rds_rdma_pages(args) * sizeof(struct scatterlist); | ||
| 451 | } | 513 | } |
| 452 | 514 | ||
| 453 | /* | 515 | /* |
| 454 | * args is a pointer to an in-kernel copy in the sendmsg cmsg. | 516 | * The application asks for a RDMA transfer. |
| 517 | * Extract all arguments and set up the rdma_op | ||
| 455 | */ | 518 | */ |
| 456 | static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | 519 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, |
| 457 | struct rds_rdma_args *args) | 520 | struct cmsghdr *cmsg) |
| 458 | { | 521 | { |
| 522 | struct rds_rdma_args *args; | ||
| 459 | struct rds_iovec vec; | 523 | struct rds_iovec vec; |
| 460 | struct rds_rdma_op *op = NULL; | 524 | struct rm_rdma_op *op = &rm->rdma; |
| 461 | unsigned int nr_pages; | 525 | unsigned int nr_pages; |
| 462 | unsigned int max_pages; | ||
| 463 | unsigned int nr_bytes; | 526 | unsigned int nr_bytes; |
| 464 | struct page **pages = NULL; | 527 | struct page **pages = NULL; |
| 465 | struct rds_iovec __user *local_vec; | 528 | struct rds_iovec __user *local_vec; |
| 466 | struct scatterlist *sg; | ||
| 467 | unsigned int nr; | 529 | unsigned int nr; |
| 468 | unsigned int i, j; | 530 | unsigned int i, j; |
| 469 | int ret; | 531 | int ret = 0; |
| 532 | |||
| 533 | if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) | ||
| 534 | || rm->rdma.op_active) | ||
| 535 | return -EINVAL; | ||
| 470 | 536 | ||
| 537 | args = CMSG_DATA(cmsg); | ||
| 471 | 538 | ||
| 472 | if (rs->rs_bound_addr == 0) { | 539 | if (rs->rs_bound_addr == 0) { |
| 473 | ret = -ENOTCONN; /* XXX not a great errno */ | 540 | ret = -ENOTCONN; /* XXX not a great errno */ |
| @@ -479,61 +546,38 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | |||
| 479 | goto out; | 546 | goto out; |
| 480 | } | 547 | } |
| 481 | 548 | ||
| 482 | nr_pages = 0; | 549 | nr_pages = rds_rdma_pages(args); |
| 483 | max_pages = 0; | 550 | if (nr_pages < 0) |
| 484 | |||
| 485 | local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; | ||
| 486 | |||
| 487 | /* figure out the number of pages in the vector */ | ||
| 488 | for (i = 0; i < args->nr_local; i++) { | ||
| 489 | if (copy_from_user(&vec, &local_vec[i], | ||
| 490 | sizeof(struct rds_iovec))) { | ||
| 491 | ret = -EFAULT; | ||
| 492 | goto out; | ||
| 493 | } | ||
| 494 | |||
| 495 | nr = rds_pages_in_vec(&vec); | ||
| 496 | if (nr == 0) { | ||
| 497 | ret = -EINVAL; | ||
| 498 | goto out; | ||
| 499 | } | ||
| 500 | |||
| 501 | max_pages = max(nr, max_pages); | ||
| 502 | nr_pages += nr; | ||
| 503 | } | ||
| 504 | |||
| 505 | pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL); | ||
| 506 | if (pages == NULL) { | ||
| 507 | ret = -ENOMEM; | ||
| 508 | goto out; | 551 | goto out; |
| 509 | } | ||
| 510 | 552 | ||
| 511 | op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL); | 553 | pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); |
| 512 | if (op == NULL) { | 554 | if (!pages) { |
| 513 | ret = -ENOMEM; | 555 | ret = -ENOMEM; |
| 514 | goto out; | 556 | goto out; |
| 515 | } | 557 | } |
| 516 | 558 | ||
| 517 | op->r_write = !!(args->flags & RDS_RDMA_READWRITE); | 559 | op->op_write = !!(args->flags & RDS_RDMA_READWRITE); |
| 518 | op->r_fence = !!(args->flags & RDS_RDMA_FENCE); | 560 | op->op_fence = !!(args->flags & RDS_RDMA_FENCE); |
| 519 | op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); | 561 | op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); |
| 520 | op->r_recverr = rs->rs_recverr; | 562 | op->op_silent = !!(args->flags & RDS_RDMA_SILENT); |
| 563 | op->op_active = 1; | ||
| 564 | op->op_recverr = rs->rs_recverr; | ||
| 521 | WARN_ON(!nr_pages); | 565 | WARN_ON(!nr_pages); |
| 522 | sg_init_table(op->r_sg, nr_pages); | 566 | op->op_sg = rds_message_alloc_sgs(rm, nr_pages); |
| 523 | 567 | ||
| 524 | if (op->r_notify || op->r_recverr) { | 568 | if (op->op_notify || op->op_recverr) { |
| 525 | /* We allocate an uninitialized notifier here, because | 569 | /* We allocate an uninitialized notifier here, because |
| 526 | * we don't want to do that in the completion handler. We | 570 | * we don't want to do that in the completion handler. We |
| 527 | * would have to use GFP_ATOMIC there, and don't want to deal | 571 | * would have to use GFP_ATOMIC there, and don't want to deal |
| 528 | * with failed allocations. | 572 | * with failed allocations. |
| 529 | */ | 573 | */ |
| 530 | op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); | 574 | op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); |
| 531 | if (!op->r_notifier) { | 575 | if (!op->op_notifier) { |
| 532 | ret = -ENOMEM; | 576 | ret = -ENOMEM; |
| 533 | goto out; | 577 | goto out; |
| 534 | } | 578 | } |
| 535 | op->r_notifier->n_user_token = args->user_token; | 579 | op->op_notifier->n_user_token = args->user_token; |
| 536 | op->r_notifier->n_status = RDS_RDMA_SUCCESS; | 580 | op->op_notifier->n_status = RDS_RDMA_SUCCESS; |
| 537 | } | 581 | } |
| 538 | 582 | ||
| 539 | /* The cookie contains the R_Key of the remote memory region, and | 583 | /* The cookie contains the R_Key of the remote memory region, and |
| @@ -543,15 +587,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | |||
| 543 | * destination address (which is really an offset into the MR) | 587 | * destination address (which is really an offset into the MR) |
| 544 | * FIXME: We may want to move this into ib_rdma.c | 588 | * FIXME: We may want to move this into ib_rdma.c |
| 545 | */ | 589 | */ |
| 546 | op->r_key = rds_rdma_cookie_key(args->cookie); | 590 | op->op_rkey = rds_rdma_cookie_key(args->cookie); |
| 547 | op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); | 591 | op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); |
| 548 | 592 | ||
| 549 | nr_bytes = 0; | 593 | nr_bytes = 0; |
| 550 | 594 | ||
| 551 | rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", | 595 | rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", |
| 552 | (unsigned long long)args->nr_local, | 596 | (unsigned long long)args->nr_local, |
| 553 | (unsigned long long)args->remote_vec.addr, | 597 | (unsigned long long)args->remote_vec.addr, |
| 554 | op->r_key); | 598 | op->op_rkey); |
| 599 | |||
| 600 | local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; | ||
| 555 | 601 | ||
| 556 | for (i = 0; i < args->nr_local; i++) { | 602 | for (i = 0; i < args->nr_local; i++) { |
| 557 | if (copy_from_user(&vec, &local_vec[i], | 603 | if (copy_from_user(&vec, &local_vec[i], |
| @@ -569,15 +615,10 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | |||
| 569 | rs->rs_user_addr = vec.addr; | 615 | rs->rs_user_addr = vec.addr; |
| 570 | rs->rs_user_bytes = vec.bytes; | 616 | rs->rs_user_bytes = vec.bytes; |
| 571 | 617 | ||
| 572 | /* did the user change the vec under us? */ | ||
| 573 | if (nr > max_pages || op->r_nents + nr > nr_pages) { | ||
| 574 | ret = -EINVAL; | ||
| 575 | goto out; | ||
| 576 | } | ||
| 577 | /* If it's a WRITE operation, we want to pin the pages for reading. | 618 | /* If it's a WRITE operation, we want to pin the pages for reading. |
| 578 | * If it's a READ operation, we need to pin the pages for writing. | 619 | * If it's a READ operation, we need to pin the pages for writing. |
| 579 | */ | 620 | */ |
| 580 | ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write); | 621 | ret = rds_pin_pages(vec.addr, nr, pages, !op->op_write); |
| 581 | if (ret < 0) | 622 | if (ret < 0) |
| 582 | goto out; | 623 | goto out; |
| 583 | 624 | ||
| @@ -588,8 +629,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | |||
| 588 | 629 | ||
| 589 | for (j = 0; j < nr; j++) { | 630 | for (j = 0; j < nr; j++) { |
| 590 | unsigned int offset = vec.addr & ~PAGE_MASK; | 631 | unsigned int offset = vec.addr & ~PAGE_MASK; |
| 632 | struct scatterlist *sg; | ||
| 591 | 633 | ||
| 592 | sg = &op->r_sg[op->r_nents + j]; | 634 | sg = &op->op_sg[op->op_nents + j]; |
| 593 | sg_set_page(sg, pages[j], | 635 | sg_set_page(sg, pages[j], |
| 594 | min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), | 636 | min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), |
| 595 | offset); | 637 | offset); |
| @@ -601,10 +643,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | |||
| 601 | vec.bytes -= sg->length; | 643 | vec.bytes -= sg->length; |
| 602 | } | 644 | } |
| 603 | 645 | ||
| 604 | op->r_nents += nr; | 646 | op->op_nents += nr; |
| 605 | } | 647 | } |
| 606 | 648 | ||
| 607 | |||
| 608 | if (nr_bytes > args->remote_vec.bytes) { | 649 | if (nr_bytes > args->remote_vec.bytes) { |
| 609 | rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", | 650 | rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", |
| 610 | nr_bytes, | 651 | nr_bytes, |
| @@ -612,38 +653,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | |||
| 612 | ret = -EINVAL; | 653 | ret = -EINVAL; |
| 613 | goto out; | 654 | goto out; |
| 614 | } | 655 | } |
| 615 | op->r_bytes = nr_bytes; | 656 | op->op_bytes = nr_bytes; |
| 616 | 657 | ||
| 617 | ret = 0; | 658 | ret = 0; |
| 618 | out: | 659 | out: |
| 619 | kfree(pages); | 660 | kfree(pages); |
| 620 | if (ret) { | 661 | if (ret) |
| 621 | if (op) | 662 | rds_rdma_free_op(op); |
| 622 | rds_rdma_free_op(op); | ||
| 623 | op = ERR_PTR(ret); | ||
| 624 | } | ||
| 625 | return op; | ||
| 626 | } | ||
| 627 | |||
| 628 | /* | ||
| 629 | * The application asks for a RDMA transfer. | ||
| 630 | * Extract all arguments and set up the rdma_op | ||
| 631 | */ | ||
| 632 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||
| 633 | struct cmsghdr *cmsg) | ||
| 634 | { | ||
| 635 | struct rds_rdma_op *op; | ||
| 636 | |||
| 637 | if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || | ||
| 638 | rm->m_rdma_op != NULL) | ||
| 639 | return -EINVAL; | ||
| 640 | 663 | ||
| 641 | op = rds_rdma_prepare(rs, CMSG_DATA(cmsg)); | ||
| 642 | if (IS_ERR(op)) | ||
| 643 | return PTR_ERR(op); | ||
| 644 | rds_stats_inc(s_send_rdma); | 664 | rds_stats_inc(s_send_rdma); |
| 645 | rm->m_rdma_op = op; | 665 | |
| 646 | return 0; | 666 | return ret; |
| 647 | } | 667 | } |
| 648 | 668 | ||
| 649 | /* | 669 | /* |
| @@ -673,7 +693,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, | |||
| 673 | 693 | ||
| 674 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); | 694 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); |
| 675 | mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); | 695 | mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); |
| 676 | if (mr == NULL) | 696 | if (!mr) |
| 677 | err = -EINVAL; /* invalid r_key */ | 697 | err = -EINVAL; /* invalid r_key */ |
| 678 | else | 698 | else |
| 679 | atomic_inc(&mr->r_refcount); | 699 | atomic_inc(&mr->r_refcount); |
| @@ -681,7 +701,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, | |||
| 681 | 701 | ||
| 682 | if (mr) { | 702 | if (mr) { |
| 683 | mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); | 703 | mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); |
| 684 | rm->m_rdma_mr = mr; | 704 | rm->rdma.op_rdma_mr = mr; |
| 685 | } | 705 | } |
| 686 | return err; | 706 | return err; |
| 687 | } | 707 | } |
| @@ -699,5 +719,98 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, | |||
| 699 | rm->m_rdma_cookie != 0) | 719 | rm->m_rdma_cookie != 0) |
| 700 | return -EINVAL; | 720 | return -EINVAL; |
| 701 | 721 | ||
| 702 | return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr); | 722 | return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr); |
| 723 | } | ||
| 724 | |||
| 725 | /* | ||
| 726 | * Fill in rds_message for an atomic request. | ||
| 727 | */ | ||
| 728 | int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, | ||
| 729 | struct cmsghdr *cmsg) | ||
| 730 | { | ||
| 731 | struct page *page = NULL; | ||
| 732 | struct rds_atomic_args *args; | ||
| 733 | int ret = 0; | ||
| 734 | |||
| 735 | if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) | ||
| 736 | || rm->atomic.op_active) | ||
| 737 | return -EINVAL; | ||
| 738 | |||
| 739 | args = CMSG_DATA(cmsg); | ||
| 740 | |||
| 741 | /* Nonmasked & masked cmsg ops converted to masked hw ops */ | ||
| 742 | switch (cmsg->cmsg_type) { | ||
| 743 | case RDS_CMSG_ATOMIC_FADD: | ||
| 744 | rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; | ||
| 745 | rm->atomic.op_m_fadd.add = args->fadd.add; | ||
| 746 | rm->atomic.op_m_fadd.nocarry_mask = 0; | ||
| 747 | break; | ||
| 748 | case RDS_CMSG_MASKED_ATOMIC_FADD: | ||
| 749 | rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; | ||
| 750 | rm->atomic.op_m_fadd.add = args->m_fadd.add; | ||
| 751 | rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; | ||
| 752 | break; | ||
| 753 | case RDS_CMSG_ATOMIC_CSWP: | ||
| 754 | rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; | ||
| 755 | rm->atomic.op_m_cswp.compare = args->cswp.compare; | ||
| 756 | rm->atomic.op_m_cswp.swap = args->cswp.swap; | ||
| 757 | rm->atomic.op_m_cswp.compare_mask = ~0; | ||
| 758 | rm->atomic.op_m_cswp.swap_mask = ~0; | ||
| 759 | break; | ||
| 760 | case RDS_CMSG_MASKED_ATOMIC_CSWP: | ||
| 761 | rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; | ||
| 762 | rm->atomic.op_m_cswp.compare = args->m_cswp.compare; | ||
| 763 | rm->atomic.op_m_cswp.swap = args->m_cswp.swap; | ||
| 764 | rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; | ||
| 765 | rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; | ||
| 766 | break; | ||
| 767 | default: | ||
| 768 | BUG(); /* should never happen */ | ||
| 769 | } | ||
| 770 | |||
| 771 | rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); | ||
| 772 | rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); | ||
| 773 | rm->atomic.op_active = 1; | ||
| 774 | rm->atomic.op_recverr = rs->rs_recverr; | ||
| 775 | rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); | ||
| 776 | |||
| 777 | /* verify 8 byte-aligned */ | ||
| 778 | if (args->local_addr & 0x7) { | ||
| 779 | ret = -EFAULT; | ||
| 780 | goto err; | ||
| 781 | } | ||
| 782 | |||
| 783 | ret = rds_pin_pages(args->local_addr, 1, &page, 1); | ||
| 784 | if (ret != 1) | ||
| 785 | goto err; | ||
| 786 | ret = 0; | ||
| 787 | |||
| 788 | sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); | ||
| 789 | |||
| 790 | if (rm->atomic.op_notify || rm->atomic.op_recverr) { | ||
| 791 | /* We allocate an uninitialized notifier here, because | ||
| 792 | * we don't want to do that in the completion handler. We | ||
| 793 | * would have to use GFP_ATOMIC there, and don't want to deal | ||
| 794 | * with failed allocations. | ||
| 795 | */ | ||
| 796 | rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); | ||
| 797 | if (!rm->atomic.op_notifier) { | ||
| 798 | ret = -ENOMEM; | ||
| 799 | goto err; | ||
| 800 | } | ||
| 801 | |||
| 802 | rm->atomic.op_notifier->n_user_token = args->user_token; | ||
| 803 | rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; | ||
| 804 | } | ||
| 805 | |||
| 806 | rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); | ||
| 807 | rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); | ||
| 808 | |||
| 809 | return ret; | ||
| 810 | err: | ||
| 811 | if (page) | ||
| 812 | put_page(page); | ||
| 813 | kfree(rm->atomic.op_notifier); | ||
| 814 | |||
| 815 | return ret; | ||
| 703 | } | 816 | } |
diff --git a/net/rds/rdma.h b/net/rds/rdma.h deleted file mode 100644 index 909c39835a5d..000000000000 --- a/net/rds/rdma.h +++ /dev/null | |||
| @@ -1,85 +0,0 @@ | |||
| 1 | #ifndef _RDS_RDMA_H | ||
| 2 | #define _RDS_RDMA_H | ||
| 3 | |||
| 4 | #include <linux/rbtree.h> | ||
| 5 | #include <linux/spinlock.h> | ||
| 6 | #include <linux/scatterlist.h> | ||
| 7 | |||
| 8 | #include "rds.h" | ||
| 9 | |||
| 10 | struct rds_mr { | ||
| 11 | struct rb_node r_rb_node; | ||
| 12 | atomic_t r_refcount; | ||
| 13 | u32 r_key; | ||
| 14 | |||
| 15 | /* A copy of the creation flags */ | ||
| 16 | unsigned int r_use_once:1; | ||
| 17 | unsigned int r_invalidate:1; | ||
| 18 | unsigned int r_write:1; | ||
| 19 | |||
| 20 | /* This is for RDS_MR_DEAD. | ||
| 21 | * It would be nice & consistent to make this part of the above | ||
| 22 | * bit field here, but we need to use test_and_set_bit. | ||
| 23 | */ | ||
| 24 | unsigned long r_state; | ||
| 25 | struct rds_sock *r_sock; /* back pointer to the socket that owns us */ | ||
| 26 | struct rds_transport *r_trans; | ||
| 27 | void *r_trans_private; | ||
| 28 | }; | ||
| 29 | |||
| 30 | /* Flags for mr->r_state */ | ||
| 31 | #define RDS_MR_DEAD 0 | ||
| 32 | |||
| 33 | struct rds_rdma_op { | ||
| 34 | u32 r_key; | ||
| 35 | u64 r_remote_addr; | ||
| 36 | unsigned int r_write:1; | ||
| 37 | unsigned int r_fence:1; | ||
| 38 | unsigned int r_notify:1; | ||
| 39 | unsigned int r_recverr:1; | ||
| 40 | unsigned int r_mapped:1; | ||
| 41 | struct rds_notifier *r_notifier; | ||
| 42 | unsigned int r_bytes; | ||
| 43 | unsigned int r_nents; | ||
| 44 | unsigned int r_count; | ||
| 45 | struct scatterlist r_sg[0]; | ||
| 46 | }; | ||
| 47 | |||
| 48 | static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) | ||
| 49 | { | ||
| 50 | return r_key | (((u64) offset) << 32); | ||
| 51 | } | ||
| 52 | |||
| 53 | static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) | ||
| 54 | { | ||
| 55 | return cookie; | ||
| 56 | } | ||
| 57 | |||
| 58 | static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) | ||
| 59 | { | ||
| 60 | return cookie >> 32; | ||
| 61 | } | ||
| 62 | |||
| 63 | int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); | ||
| 64 | int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); | ||
| 65 | int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); | ||
| 66 | void rds_rdma_drop_keys(struct rds_sock *rs); | ||
| 67 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||
| 68 | struct cmsghdr *cmsg); | ||
| 69 | int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, | ||
| 70 | struct cmsghdr *cmsg); | ||
| 71 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||
| 72 | struct cmsghdr *cmsg); | ||
| 73 | int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, | ||
| 74 | struct cmsghdr *cmsg); | ||
| 75 | void rds_rdma_free_op(struct rds_rdma_op *ro); | ||
| 76 | void rds_rdma_send_complete(struct rds_message *rm, int); | ||
| 77 | |||
| 78 | extern void __rds_put_mr_final(struct rds_mr *mr); | ||
| 79 | static inline void rds_mr_put(struct rds_mr *mr) | ||
| 80 | { | ||
| 81 | if (atomic_dec_and_test(&mr->r_refcount)) | ||
| 82 | __rds_put_mr_final(mr); | ||
| 83 | } | ||
| 84 | |||
| 85 | #endif | ||
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index e599ba2f950d..e6ed10aee190 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c | |||
| @@ -36,6 +36,34 @@ | |||
| 36 | 36 | ||
| 37 | static struct rdma_cm_id *rds_rdma_listen_id; | 37 | static struct rdma_cm_id *rds_rdma_listen_id; |
| 38 | 38 | ||
| 39 | static char *rds_cm_event_strings[] = { | ||
| 40 | #define RDS_CM_EVENT_STRING(foo) \ | ||
| 41 | [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo) | ||
| 42 | RDS_CM_EVENT_STRING(ADDR_RESOLVED), | ||
| 43 | RDS_CM_EVENT_STRING(ADDR_ERROR), | ||
| 44 | RDS_CM_EVENT_STRING(ROUTE_RESOLVED), | ||
| 45 | RDS_CM_EVENT_STRING(ROUTE_ERROR), | ||
| 46 | RDS_CM_EVENT_STRING(CONNECT_REQUEST), | ||
| 47 | RDS_CM_EVENT_STRING(CONNECT_RESPONSE), | ||
| 48 | RDS_CM_EVENT_STRING(CONNECT_ERROR), | ||
| 49 | RDS_CM_EVENT_STRING(UNREACHABLE), | ||
| 50 | RDS_CM_EVENT_STRING(REJECTED), | ||
| 51 | RDS_CM_EVENT_STRING(ESTABLISHED), | ||
| 52 | RDS_CM_EVENT_STRING(DISCONNECTED), | ||
| 53 | RDS_CM_EVENT_STRING(DEVICE_REMOVAL), | ||
| 54 | RDS_CM_EVENT_STRING(MULTICAST_JOIN), | ||
| 55 | RDS_CM_EVENT_STRING(MULTICAST_ERROR), | ||
| 56 | RDS_CM_EVENT_STRING(ADDR_CHANGE), | ||
| 57 | RDS_CM_EVENT_STRING(TIMEWAIT_EXIT), | ||
| 58 | #undef RDS_CM_EVENT_STRING | ||
| 59 | }; | ||
| 60 | |||
| 61 | static char *rds_cm_event_str(enum rdma_cm_event_type type) | ||
| 62 | { | ||
| 63 | return rds_str_array(rds_cm_event_strings, | ||
| 64 | ARRAY_SIZE(rds_cm_event_strings), type); | ||
| 65 | }; | ||
| 66 | |||
| 39 | int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, | 67 | int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, |
| 40 | struct rdma_cm_event *event) | 68 | struct rdma_cm_event *event) |
| 41 | { | 69 | { |
| @@ -44,8 +72,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, | |||
| 44 | struct rds_transport *trans; | 72 | struct rds_transport *trans; |
| 45 | int ret = 0; | 73 | int ret = 0; |
| 46 | 74 | ||
| 47 | rdsdebug("conn %p id %p handling event %u\n", conn, cm_id, | 75 | rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, |
| 48 | event->event); | 76 | event->event, rds_cm_event_str(event->event)); |
| 49 | 77 | ||
| 50 | if (cm_id->device->node_type == RDMA_NODE_RNIC) | 78 | if (cm_id->device->node_type == RDMA_NODE_RNIC) |
| 51 | trans = &rds_iw_transport; | 79 | trans = &rds_iw_transport; |
| @@ -109,7 +137,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, | |||
| 109 | 137 | ||
| 110 | default: | 138 | default: |
| 111 | /* things like device disconnect? */ | 139 | /* things like device disconnect? */ |
| 112 | printk(KERN_ERR "RDS: unknown event %u!\n", event->event); | 140 | printk(KERN_ERR "RDS: unknown event %u (%s)!\n", |
| 141 | event->event, rds_cm_event_str(event->event)); | ||
| 113 | break; | 142 | break; |
| 114 | } | 143 | } |
| 115 | 144 | ||
| @@ -117,12 +146,13 @@ out: | |||
| 117 | if (conn) | 146 | if (conn) |
| 118 | mutex_unlock(&conn->c_cm_lock); | 147 | mutex_unlock(&conn->c_cm_lock); |
| 119 | 148 | ||
| 120 | rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret); | 149 | rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event, |
| 150 | rds_cm_event_str(event->event), ret); | ||
| 121 | 151 | ||
| 122 | return ret; | 152 | return ret; |
| 123 | } | 153 | } |
| 124 | 154 | ||
| 125 | static int __init rds_rdma_listen_init(void) | 155 | static int rds_rdma_listen_init(void) |
| 126 | { | 156 | { |
| 127 | struct sockaddr_in sin; | 157 | struct sockaddr_in sin; |
| 128 | struct rdma_cm_id *cm_id; | 158 | struct rdma_cm_id *cm_id; |
| @@ -177,7 +207,7 @@ static void rds_rdma_listen_stop(void) | |||
| 177 | } | 207 | } |
| 178 | } | 208 | } |
| 179 | 209 | ||
| 180 | int __init rds_rdma_init(void) | 210 | int rds_rdma_init(void) |
| 181 | { | 211 | { |
| 182 | int ret; | 212 | int ret; |
| 183 | 213 | ||
diff --git a/net/rds/rds.h b/net/rds/rds.h index c224b5bb3ba9..8103dcf8b976 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h | |||
| @@ -80,6 +80,7 @@ enum { | |||
| 80 | /* Bits for c_flags */ | 80 | /* Bits for c_flags */ |
| 81 | #define RDS_LL_SEND_FULL 0 | 81 | #define RDS_LL_SEND_FULL 0 |
| 82 | #define RDS_RECONNECT_PENDING 1 | 82 | #define RDS_RECONNECT_PENDING 1 |
| 83 | #define RDS_IN_XMIT 2 | ||
| 83 | 84 | ||
| 84 | struct rds_connection { | 85 | struct rds_connection { |
| 85 | struct hlist_node c_hash_node; | 86 | struct hlist_node c_hash_node; |
| @@ -91,12 +92,13 @@ struct rds_connection { | |||
| 91 | struct rds_cong_map *c_lcong; | 92 | struct rds_cong_map *c_lcong; |
| 92 | struct rds_cong_map *c_fcong; | 93 | struct rds_cong_map *c_fcong; |
| 93 | 94 | ||
| 94 | struct mutex c_send_lock; /* protect send ring */ | ||
| 95 | struct rds_message *c_xmit_rm; | 95 | struct rds_message *c_xmit_rm; |
| 96 | unsigned long c_xmit_sg; | 96 | unsigned long c_xmit_sg; |
| 97 | unsigned int c_xmit_hdr_off; | 97 | unsigned int c_xmit_hdr_off; |
| 98 | unsigned int c_xmit_data_off; | 98 | unsigned int c_xmit_data_off; |
| 99 | unsigned int c_xmit_atomic_sent; | ||
| 99 | unsigned int c_xmit_rdma_sent; | 100 | unsigned int c_xmit_rdma_sent; |
| 101 | unsigned int c_xmit_data_sent; | ||
| 100 | 102 | ||
| 101 | spinlock_t c_lock; /* protect msg queues */ | 103 | spinlock_t c_lock; /* protect msg queues */ |
| 102 | u64 c_next_tx_seq; | 104 | u64 c_next_tx_seq; |
| @@ -116,11 +118,10 @@ struct rds_connection { | |||
| 116 | struct delayed_work c_conn_w; | 118 | struct delayed_work c_conn_w; |
| 117 | struct work_struct c_down_w; | 119 | struct work_struct c_down_w; |
| 118 | struct mutex c_cm_lock; /* protect conn state & cm */ | 120 | struct mutex c_cm_lock; /* protect conn state & cm */ |
| 121 | wait_queue_head_t c_waitq; | ||
| 119 | 122 | ||
| 120 | struct list_head c_map_item; | 123 | struct list_head c_map_item; |
| 121 | unsigned long c_map_queued; | 124 | unsigned long c_map_queued; |
| 122 | unsigned long c_map_offset; | ||
| 123 | unsigned long c_map_bytes; | ||
| 124 | 125 | ||
| 125 | unsigned int c_unacked_packets; | 126 | unsigned int c_unacked_packets; |
| 126 | unsigned int c_unacked_bytes; | 127 | unsigned int c_unacked_bytes; |
| @@ -206,6 +207,48 @@ struct rds_incoming { | |||
| 206 | rds_rdma_cookie_t i_rdma_cookie; | 207 | rds_rdma_cookie_t i_rdma_cookie; |
| 207 | }; | 208 | }; |
| 208 | 209 | ||
| 210 | struct rds_mr { | ||
| 211 | struct rb_node r_rb_node; | ||
| 212 | atomic_t r_refcount; | ||
| 213 | u32 r_key; | ||
| 214 | |||
| 215 | /* A copy of the creation flags */ | ||
| 216 | unsigned int r_use_once:1; | ||
| 217 | unsigned int r_invalidate:1; | ||
| 218 | unsigned int r_write:1; | ||
| 219 | |||
| 220 | /* This is for RDS_MR_DEAD. | ||
| 221 | * It would be nice & consistent to make this part of the above | ||
| 222 | * bit field here, but we need to use test_and_set_bit. | ||
| 223 | */ | ||
| 224 | unsigned long r_state; | ||
| 225 | struct rds_sock *r_sock; /* back pointer to the socket that owns us */ | ||
| 226 | struct rds_transport *r_trans; | ||
| 227 | void *r_trans_private; | ||
| 228 | }; | ||
| 229 | |||
| 230 | /* Flags for mr->r_state */ | ||
| 231 | #define RDS_MR_DEAD 0 | ||
| 232 | |||
| 233 | static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) | ||
| 234 | { | ||
| 235 | return r_key | (((u64) offset) << 32); | ||
| 236 | } | ||
| 237 | |||
| 238 | static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) | ||
| 239 | { | ||
| 240 | return cookie; | ||
| 241 | } | ||
| 242 | |||
| 243 | static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) | ||
| 244 | { | ||
| 245 | return cookie >> 32; | ||
| 246 | } | ||
| 247 | |||
| 248 | /* atomic operation types */ | ||
| 249 | #define RDS_ATOMIC_TYPE_CSWP 0 | ||
| 250 | #define RDS_ATOMIC_TYPE_FADD 1 | ||
| 251 | |||
| 209 | /* | 252 | /* |
| 210 | * m_sock_item and m_conn_item are on lists that are serialized under | 253 | * m_sock_item and m_conn_item are on lists that are serialized under |
| 211 | * conn->c_lock. m_sock_item has additional meaning in that once it is empty | 254 | * conn->c_lock. m_sock_item has additional meaning in that once it is empty |
| @@ -258,13 +301,71 @@ struct rds_message { | |||
| 258 | * -> rs->rs_lock | 301 | * -> rs->rs_lock |
| 259 | */ | 302 | */ |
| 260 | spinlock_t m_rs_lock; | 303 | spinlock_t m_rs_lock; |
| 304 | wait_queue_head_t m_flush_wait; | ||
| 305 | |||
| 261 | struct rds_sock *m_rs; | 306 | struct rds_sock *m_rs; |
| 262 | struct rds_rdma_op *m_rdma_op; | 307 | |
| 308 | /* cookie to send to remote, in rds header */ | ||
| 263 | rds_rdma_cookie_t m_rdma_cookie; | 309 | rds_rdma_cookie_t m_rdma_cookie; |
| 264 | struct rds_mr *m_rdma_mr; | 310 | |
| 265 | unsigned int m_nents; | 311 | unsigned int m_used_sgs; |
| 266 | unsigned int m_count; | 312 | unsigned int m_total_sgs; |
| 267 | struct scatterlist m_sg[0]; | 313 | |
| 314 | void *m_final_op; | ||
| 315 | |||
| 316 | struct { | ||
| 317 | struct rm_atomic_op { | ||
| 318 | int op_type; | ||
| 319 | union { | ||
| 320 | struct { | ||
| 321 | uint64_t compare; | ||
| 322 | uint64_t swap; | ||
| 323 | uint64_t compare_mask; | ||
| 324 | uint64_t swap_mask; | ||
| 325 | } op_m_cswp; | ||
| 326 | struct { | ||
| 327 | uint64_t add; | ||
| 328 | uint64_t nocarry_mask; | ||
| 329 | } op_m_fadd; | ||
| 330 | }; | ||
| 331 | |||
| 332 | u32 op_rkey; | ||
| 333 | u64 op_remote_addr; | ||
| 334 | unsigned int op_notify:1; | ||
| 335 | unsigned int op_recverr:1; | ||
| 336 | unsigned int op_mapped:1; | ||
| 337 | unsigned int op_silent:1; | ||
| 338 | unsigned int op_active:1; | ||
| 339 | struct scatterlist *op_sg; | ||
| 340 | struct rds_notifier *op_notifier; | ||
| 341 | |||
| 342 | struct rds_mr *op_rdma_mr; | ||
| 343 | } atomic; | ||
| 344 | struct rm_rdma_op { | ||
| 345 | u32 op_rkey; | ||
| 346 | u64 op_remote_addr; | ||
| 347 | unsigned int op_write:1; | ||
| 348 | unsigned int op_fence:1; | ||
| 349 | unsigned int op_notify:1; | ||
| 350 | unsigned int op_recverr:1; | ||
| 351 | unsigned int op_mapped:1; | ||
| 352 | unsigned int op_silent:1; | ||
| 353 | unsigned int op_active:1; | ||
| 354 | unsigned int op_bytes; | ||
| 355 | unsigned int op_nents; | ||
| 356 | unsigned int op_count; | ||
| 357 | struct scatterlist *op_sg; | ||
| 358 | struct rds_notifier *op_notifier; | ||
| 359 | |||
| 360 | struct rds_mr *op_rdma_mr; | ||
| 361 | } rdma; | ||
| 362 | struct rm_data_op { | ||
| 363 | unsigned int op_active:1; | ||
| 364 | unsigned int op_nents; | ||
| 365 | unsigned int op_count; | ||
| 366 | struct scatterlist *op_sg; | ||
| 367 | } data; | ||
| 368 | }; | ||
| 268 | }; | 369 | }; |
| 269 | 370 | ||
| 270 | /* | 371 | /* |
| @@ -305,10 +406,6 @@ struct rds_notifier { | |||
| 305 | * transport is responsible for other serialization, including | 406 | * transport is responsible for other serialization, including |
| 306 | * rds_recv_incoming(). This is called in process context but | 407 | * rds_recv_incoming(). This is called in process context but |
| 307 | * should try hard not to block. | 408 | * should try hard not to block. |
| 308 | * | ||
| 309 | * @xmit_cong_map: This asks the transport to send the local bitmap down the | ||
| 310 | * given connection. XXX get a better story about the bitmap | ||
| 311 | * flag and header. | ||
| 312 | */ | 409 | */ |
| 313 | 410 | ||
| 314 | #define RDS_TRANS_IB 0 | 411 | #define RDS_TRANS_IB 0 |
| @@ -332,13 +429,11 @@ struct rds_transport { | |||
| 332 | void (*xmit_complete)(struct rds_connection *conn); | 429 | void (*xmit_complete)(struct rds_connection *conn); |
| 333 | int (*xmit)(struct rds_connection *conn, struct rds_message *rm, | 430 | int (*xmit)(struct rds_connection *conn, struct rds_message *rm, |
| 334 | unsigned int hdr_off, unsigned int sg, unsigned int off); | 431 | unsigned int hdr_off, unsigned int sg, unsigned int off); |
| 335 | int (*xmit_cong_map)(struct rds_connection *conn, | 432 | int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); |
| 336 | struct rds_cong_map *map, unsigned long offset); | 433 | int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); |
| 337 | int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op); | ||
| 338 | int (*recv)(struct rds_connection *conn); | 434 | int (*recv)(struct rds_connection *conn); |
| 339 | int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, | 435 | int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, |
| 340 | size_t size); | 436 | size_t size); |
| 341 | void (*inc_purge)(struct rds_incoming *inc); | ||
| 342 | void (*inc_free)(struct rds_incoming *inc); | 437 | void (*inc_free)(struct rds_incoming *inc); |
| 343 | 438 | ||
| 344 | int (*cm_handle_connect)(struct rdma_cm_id *cm_id, | 439 | int (*cm_handle_connect)(struct rdma_cm_id *cm_id, |
| @@ -367,17 +462,11 @@ struct rds_sock { | |||
| 367 | * bound_addr used for both incoming and outgoing, no INADDR_ANY | 462 | * bound_addr used for both incoming and outgoing, no INADDR_ANY |
| 368 | * support. | 463 | * support. |
| 369 | */ | 464 | */ |
| 370 | struct rb_node rs_bound_node; | 465 | struct hlist_node rs_bound_node; |
| 371 | __be32 rs_bound_addr; | 466 | __be32 rs_bound_addr; |
| 372 | __be32 rs_conn_addr; | 467 | __be32 rs_conn_addr; |
| 373 | __be16 rs_bound_port; | 468 | __be16 rs_bound_port; |
| 374 | __be16 rs_conn_port; | 469 | __be16 rs_conn_port; |
| 375 | |||
| 376 | /* | ||
| 377 | * This is only used to communicate the transport between bind and | ||
| 378 | * initiating connections. All other trans use is referenced through | ||
| 379 | * the connection. | ||
| 380 | */ | ||
| 381 | struct rds_transport *rs_transport; | 470 | struct rds_transport *rs_transport; |
| 382 | 471 | ||
| 383 | /* | 472 | /* |
| @@ -466,8 +555,8 @@ struct rds_statistics { | |||
| 466 | uint64_t s_recv_ping; | 555 | uint64_t s_recv_ping; |
| 467 | uint64_t s_send_queue_empty; | 556 | uint64_t s_send_queue_empty; |
| 468 | uint64_t s_send_queue_full; | 557 | uint64_t s_send_queue_full; |
| 469 | uint64_t s_send_sem_contention; | 558 | uint64_t s_send_lock_contention; |
| 470 | uint64_t s_send_sem_queue_raced; | 559 | uint64_t s_send_lock_queue_raced; |
| 471 | uint64_t s_send_immediate_retry; | 560 | uint64_t s_send_immediate_retry; |
| 472 | uint64_t s_send_delayed_retry; | 561 | uint64_t s_send_delayed_retry; |
| 473 | uint64_t s_send_drop_acked; | 562 | uint64_t s_send_drop_acked; |
| @@ -487,6 +576,7 @@ struct rds_statistics { | |||
| 487 | }; | 576 | }; |
| 488 | 577 | ||
| 489 | /* af_rds.c */ | 578 | /* af_rds.c */ |
| 579 | char *rds_str_array(char **array, size_t elements, size_t index); | ||
| 490 | void rds_sock_addref(struct rds_sock *rs); | 580 | void rds_sock_addref(struct rds_sock *rs); |
| 491 | void rds_sock_put(struct rds_sock *rs); | 581 | void rds_sock_put(struct rds_sock *rs); |
| 492 | void rds_wake_sk_sleep(struct rds_sock *rs); | 582 | void rds_wake_sk_sleep(struct rds_sock *rs); |
| @@ -521,15 +611,17 @@ void rds_cong_exit(void); | |||
| 521 | struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); | 611 | struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); |
| 522 | 612 | ||
| 523 | /* conn.c */ | 613 | /* conn.c */ |
| 524 | int __init rds_conn_init(void); | 614 | int rds_conn_init(void); |
| 525 | void rds_conn_exit(void); | 615 | void rds_conn_exit(void); |
| 526 | struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, | 616 | struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, |
| 527 | struct rds_transport *trans, gfp_t gfp); | 617 | struct rds_transport *trans, gfp_t gfp); |
| 528 | struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, | 618 | struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, |
| 529 | struct rds_transport *trans, gfp_t gfp); | 619 | struct rds_transport *trans, gfp_t gfp); |
| 620 | void rds_conn_shutdown(struct rds_connection *conn); | ||
| 530 | void rds_conn_destroy(struct rds_connection *conn); | 621 | void rds_conn_destroy(struct rds_connection *conn); |
| 531 | void rds_conn_reset(struct rds_connection *conn); | 622 | void rds_conn_reset(struct rds_connection *conn); |
| 532 | void rds_conn_drop(struct rds_connection *conn); | 623 | void rds_conn_drop(struct rds_connection *conn); |
| 624 | void rds_conn_connect_if_down(struct rds_connection *conn); | ||
| 533 | void rds_for_each_conn_info(struct socket *sock, unsigned int len, | 625 | void rds_for_each_conn_info(struct socket *sock, unsigned int len, |
| 534 | struct rds_info_iterator *iter, | 626 | struct rds_info_iterator *iter, |
| 535 | struct rds_info_lengths *lens, | 627 | struct rds_info_lengths *lens, |
| @@ -566,7 +658,8 @@ rds_conn_connecting(struct rds_connection *conn) | |||
| 566 | 658 | ||
| 567 | /* message.c */ | 659 | /* message.c */ |
| 568 | struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); | 660 | struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); |
| 569 | struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, | 661 | struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); |
| 662 | int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, | ||
| 570 | size_t total_len); | 663 | size_t total_len); |
| 571 | struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); | 664 | struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); |
| 572 | void rds_message_populate_header(struct rds_header *hdr, __be16 sport, | 665 | void rds_message_populate_header(struct rds_header *hdr, __be16 sport, |
| @@ -580,7 +673,6 @@ int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *vers | |||
| 580 | int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); | 673 | int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); |
| 581 | int rds_message_inc_copy_to_user(struct rds_incoming *inc, | 674 | int rds_message_inc_copy_to_user(struct rds_incoming *inc, |
| 582 | struct iovec *first_iov, size_t size); | 675 | struct iovec *first_iov, size_t size); |
| 583 | void rds_message_inc_purge(struct rds_incoming *inc); | ||
| 584 | void rds_message_inc_free(struct rds_incoming *inc); | 676 | void rds_message_inc_free(struct rds_incoming *inc); |
| 585 | void rds_message_addref(struct rds_message *rm); | 677 | void rds_message_addref(struct rds_message *rm); |
| 586 | void rds_message_put(struct rds_message *rm); | 678 | void rds_message_put(struct rds_message *rm); |
| @@ -636,14 +728,39 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); | |||
| 636 | typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); | 728 | typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); |
| 637 | void rds_send_drop_acked(struct rds_connection *conn, u64 ack, | 729 | void rds_send_drop_acked(struct rds_connection *conn, u64 ack, |
| 638 | is_acked_func is_acked); | 730 | is_acked_func is_acked); |
| 639 | int rds_send_acked_before(struct rds_connection *conn, u64 seq); | ||
| 640 | void rds_send_remove_from_sock(struct list_head *messages, int status); | 731 | void rds_send_remove_from_sock(struct list_head *messages, int status); |
| 641 | int rds_send_pong(struct rds_connection *conn, __be16 dport); | 732 | int rds_send_pong(struct rds_connection *conn, __be16 dport); |
| 642 | struct rds_message *rds_send_get_message(struct rds_connection *, | 733 | struct rds_message *rds_send_get_message(struct rds_connection *, |
| 643 | struct rds_rdma_op *); | 734 | struct rm_rdma_op *); |
| 644 | 735 | ||
| 645 | /* rdma.c */ | 736 | /* rdma.c */ |
| 646 | void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); | 737 | void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); |
| 738 | int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); | ||
| 739 | int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); | ||
| 740 | int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); | ||
| 741 | void rds_rdma_drop_keys(struct rds_sock *rs); | ||
| 742 | int rds_rdma_extra_size(struct rds_rdma_args *args); | ||
| 743 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||
| 744 | struct cmsghdr *cmsg); | ||
| 745 | int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, | ||
| 746 | struct cmsghdr *cmsg); | ||
| 747 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||
| 748 | struct cmsghdr *cmsg); | ||
| 749 | int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, | ||
| 750 | struct cmsghdr *cmsg); | ||
| 751 | void rds_rdma_free_op(struct rm_rdma_op *ro); | ||
| 752 | void rds_atomic_free_op(struct rm_atomic_op *ao); | ||
| 753 | void rds_rdma_send_complete(struct rds_message *rm, int wc_status); | ||
| 754 | void rds_atomic_send_complete(struct rds_message *rm, int wc_status); | ||
| 755 | int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, | ||
| 756 | struct cmsghdr *cmsg); | ||
| 757 | |||
| 758 | extern void __rds_put_mr_final(struct rds_mr *mr); | ||
| 759 | static inline void rds_mr_put(struct rds_mr *mr) | ||
| 760 | { | ||
| 761 | if (atomic_dec_and_test(&mr->r_refcount)) | ||
| 762 | __rds_put_mr_final(mr); | ||
| 763 | } | ||
| 647 | 764 | ||
| 648 | /* stats.c */ | 765 | /* stats.c */ |
| 649 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); | 766 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); |
| @@ -657,14 +774,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); | |||
| 657 | put_cpu(); \ | 774 | put_cpu(); \ |
| 658 | } while (0) | 775 | } while (0) |
| 659 | #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) | 776 | #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) |
| 660 | int __init rds_stats_init(void); | 777 | int rds_stats_init(void); |
| 661 | void rds_stats_exit(void); | 778 | void rds_stats_exit(void); |
| 662 | void rds_stats_info_copy(struct rds_info_iterator *iter, | 779 | void rds_stats_info_copy(struct rds_info_iterator *iter, |
| 663 | uint64_t *values, const char *const *names, | 780 | uint64_t *values, const char *const *names, |
| 664 | size_t nr); | 781 | size_t nr); |
| 665 | 782 | ||
| 666 | /* sysctl.c */ | 783 | /* sysctl.c */ |
| 667 | int __init rds_sysctl_init(void); | 784 | int rds_sysctl_init(void); |
| 668 | void rds_sysctl_exit(void); | 785 | void rds_sysctl_exit(void); |
| 669 | extern unsigned long rds_sysctl_sndbuf_min; | 786 | extern unsigned long rds_sysctl_sndbuf_min; |
| 670 | extern unsigned long rds_sysctl_sndbuf_default; | 787 | extern unsigned long rds_sysctl_sndbuf_default; |
| @@ -678,9 +795,10 @@ extern unsigned long rds_sysctl_trace_flags; | |||
| 678 | extern unsigned int rds_sysctl_trace_level; | 795 | extern unsigned int rds_sysctl_trace_level; |
| 679 | 796 | ||
| 680 | /* threads.c */ | 797 | /* threads.c */ |
| 681 | int __init rds_threads_init(void); | 798 | int rds_threads_init(void); |
| 682 | void rds_threads_exit(void); | 799 | void rds_threads_exit(void); |
| 683 | extern struct workqueue_struct *rds_wq; | 800 | extern struct workqueue_struct *rds_wq; |
| 801 | void rds_queue_reconnect(struct rds_connection *conn); | ||
| 684 | void rds_connect_worker(struct work_struct *); | 802 | void rds_connect_worker(struct work_struct *); |
| 685 | void rds_shutdown_worker(struct work_struct *); | 803 | void rds_shutdown_worker(struct work_struct *); |
| 686 | void rds_send_worker(struct work_struct *); | 804 | void rds_send_worker(struct work_struct *); |
| @@ -691,9 +809,10 @@ void rds_connect_complete(struct rds_connection *conn); | |||
| 691 | int rds_trans_register(struct rds_transport *trans); | 809 | int rds_trans_register(struct rds_transport *trans); |
| 692 | void rds_trans_unregister(struct rds_transport *trans); | 810 | void rds_trans_unregister(struct rds_transport *trans); |
| 693 | struct rds_transport *rds_trans_get_preferred(__be32 addr); | 811 | struct rds_transport *rds_trans_get_preferred(__be32 addr); |
| 812 | void rds_trans_put(struct rds_transport *trans); | ||
| 694 | unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, | 813 | unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, |
| 695 | unsigned int avail); | 814 | unsigned int avail); |
| 696 | int __init rds_trans_init(void); | 815 | int rds_trans_init(void); |
| 697 | void rds_trans_exit(void); | 816 | void rds_trans_exit(void); |
| 698 | 817 | ||
| 699 | #endif | 818 | #endif |
diff --git a/net/rds/recv.c b/net/rds/recv.c index c93588c2d553..68800f02aa30 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c | |||
| @@ -36,7 +36,6 @@ | |||
| 36 | #include <linux/in.h> | 36 | #include <linux/in.h> |
| 37 | 37 | ||
| 38 | #include "rds.h" | 38 | #include "rds.h" |
| 39 | #include "rdma.h" | ||
| 40 | 39 | ||
| 41 | void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, | 40 | void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, |
| 42 | __be32 saddr) | 41 | __be32 saddr) |
| @@ -210,7 +209,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, | |||
| 210 | } | 209 | } |
| 211 | 210 | ||
| 212 | rs = rds_find_bound(daddr, inc->i_hdr.h_dport); | 211 | rs = rds_find_bound(daddr, inc->i_hdr.h_dport); |
| 213 | if (rs == NULL) { | 212 | if (!rs) { |
| 214 | rds_stats_inc(s_recv_drop_no_sock); | 213 | rds_stats_inc(s_recv_drop_no_sock); |
| 215 | goto out; | 214 | goto out; |
| 216 | } | 215 | } |
| @@ -251,7 +250,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) | |||
| 251 | { | 250 | { |
| 252 | unsigned long flags; | 251 | unsigned long flags; |
| 253 | 252 | ||
| 254 | if (*inc == NULL) { | 253 | if (!*inc) { |
| 255 | read_lock_irqsave(&rs->rs_recv_lock, flags); | 254 | read_lock_irqsave(&rs->rs_recv_lock, flags); |
| 256 | if (!list_empty(&rs->rs_recv_queue)) { | 255 | if (!list_empty(&rs->rs_recv_queue)) { |
| 257 | *inc = list_entry(rs->rs_recv_queue.next, | 256 | *inc = list_entry(rs->rs_recv_queue.next, |
| @@ -334,10 +333,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) | |||
| 334 | 333 | ||
| 335 | if (msghdr) { | 334 | if (msghdr) { |
| 336 | cmsg.user_token = notifier->n_user_token; | 335 | cmsg.user_token = notifier->n_user_token; |
| 337 | cmsg.status = notifier->n_status; | 336 | cmsg.status = notifier->n_status; |
| 338 | 337 | ||
| 339 | err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, | 338 | err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, |
| 340 | sizeof(cmsg), &cmsg); | 339 | sizeof(cmsg), &cmsg); |
| 341 | if (err) | 340 | if (err) |
| 342 | break; | 341 | break; |
| 343 | } | 342 | } |
diff --git a/net/rds/send.c b/net/rds/send.c index 9c1c6bcaa6c9..9b951a0ab6b7 100644 --- a/net/rds/send.c +++ b/net/rds/send.c | |||
| @@ -37,7 +37,6 @@ | |||
| 37 | #include <linux/list.h> | 37 | #include <linux/list.h> |
| 38 | 38 | ||
| 39 | #include "rds.h" | 39 | #include "rds.h" |
| 40 | #include "rdma.h" | ||
| 41 | 40 | ||
| 42 | /* When transmitting messages in rds_send_xmit, we need to emerge from | 41 | /* When transmitting messages in rds_send_xmit, we need to emerge from |
| 43 | * time to time and briefly release the CPU. Otherwise the softlock watchdog | 42 | * time to time and briefly release the CPU. Otherwise the softlock watchdog |
| @@ -54,7 +53,8 @@ module_param(send_batch_count, int, 0444); | |||
| 54 | MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); | 53 | MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); |
| 55 | 54 | ||
| 56 | /* | 55 | /* |
| 57 | * Reset the send state. Caller must hold c_send_lock when calling here. | 56 | * Reset the send state. Callers must ensure that this doesn't race with |
| 57 | * rds_send_xmit(). | ||
| 58 | */ | 58 | */ |
| 59 | void rds_send_reset(struct rds_connection *conn) | 59 | void rds_send_reset(struct rds_connection *conn) |
| 60 | { | 60 | { |
| @@ -62,18 +62,22 @@ void rds_send_reset(struct rds_connection *conn) | |||
| 62 | unsigned long flags; | 62 | unsigned long flags; |
| 63 | 63 | ||
| 64 | if (conn->c_xmit_rm) { | 64 | if (conn->c_xmit_rm) { |
| 65 | rm = conn->c_xmit_rm; | ||
| 66 | conn->c_xmit_rm = NULL; | ||
| 65 | /* Tell the user the RDMA op is no longer mapped by the | 67 | /* Tell the user the RDMA op is no longer mapped by the |
| 66 | * transport. This isn't entirely true (it's flushed out | 68 | * transport. This isn't entirely true (it's flushed out |
| 67 | * independently) but as the connection is down, there's | 69 | * independently) but as the connection is down, there's |
| 68 | * no ongoing RDMA to/from that memory */ | 70 | * no ongoing RDMA to/from that memory */ |
| 69 | rds_message_unmapped(conn->c_xmit_rm); | 71 | rds_message_unmapped(rm); |
| 70 | rds_message_put(conn->c_xmit_rm); | 72 | rds_message_put(rm); |
| 71 | conn->c_xmit_rm = NULL; | ||
| 72 | } | 73 | } |
| 74 | |||
| 73 | conn->c_xmit_sg = 0; | 75 | conn->c_xmit_sg = 0; |
| 74 | conn->c_xmit_hdr_off = 0; | 76 | conn->c_xmit_hdr_off = 0; |
| 75 | conn->c_xmit_data_off = 0; | 77 | conn->c_xmit_data_off = 0; |
| 78 | conn->c_xmit_atomic_sent = 0; | ||
| 76 | conn->c_xmit_rdma_sent = 0; | 79 | conn->c_xmit_rdma_sent = 0; |
| 80 | conn->c_xmit_data_sent = 0; | ||
| 77 | 81 | ||
| 78 | conn->c_map_queued = 0; | 82 | conn->c_map_queued = 0; |
| 79 | 83 | ||
| @@ -90,6 +94,25 @@ void rds_send_reset(struct rds_connection *conn) | |||
| 90 | spin_unlock_irqrestore(&conn->c_lock, flags); | 94 | spin_unlock_irqrestore(&conn->c_lock, flags); |
| 91 | } | 95 | } |
| 92 | 96 | ||
| 97 | static int acquire_in_xmit(struct rds_connection *conn) | ||
| 98 | { | ||
| 99 | return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0; | ||
| 100 | } | ||
| 101 | |||
| 102 | static void release_in_xmit(struct rds_connection *conn) | ||
| 103 | { | ||
| 104 | clear_bit(RDS_IN_XMIT, &conn->c_flags); | ||
| 105 | smp_mb__after_clear_bit(); | ||
| 106 | /* | ||
| 107 | * We don't use wait_on_bit()/wake_up_bit() because our waking is in a | ||
| 108 | * hot path and finding waiters is very rare. We don't want to walk | ||
| 109 | * the system-wide hashed waitqueue buckets in the fast path only to | ||
| 110 | * almost never find waiters. | ||
| 111 | */ | ||
| 112 | if (waitqueue_active(&conn->c_waitq)) | ||
| 113 | wake_up_all(&conn->c_waitq); | ||
| 114 | } | ||
| 115 | |||
| 93 | /* | 116 | /* |
| 94 | * We're making the concious trade-off here to only send one message | 117 | * We're making the concious trade-off here to only send one message |
| 95 | * down the connection at a time. | 118 | * down the connection at a time. |
| @@ -109,102 +132,69 @@ int rds_send_xmit(struct rds_connection *conn) | |||
| 109 | struct rds_message *rm; | 132 | struct rds_message *rm; |
| 110 | unsigned long flags; | 133 | unsigned long flags; |
| 111 | unsigned int tmp; | 134 | unsigned int tmp; |
| 112 | unsigned int send_quota = send_batch_count; | ||
| 113 | struct scatterlist *sg; | 135 | struct scatterlist *sg; |
| 114 | int ret = 0; | 136 | int ret = 0; |
| 115 | int was_empty = 0; | ||
| 116 | LIST_HEAD(to_be_dropped); | 137 | LIST_HEAD(to_be_dropped); |
| 117 | 138 | ||
| 139 | restart: | ||
| 140 | |||
| 118 | /* | 141 | /* |
| 119 | * sendmsg calls here after having queued its message on the send | 142 | * sendmsg calls here after having queued its message on the send |
| 120 | * queue. We only have one task feeding the connection at a time. If | 143 | * queue. We only have one task feeding the connection at a time. If |
| 121 | * another thread is already feeding the queue then we back off. This | 144 | * another thread is already feeding the queue then we back off. This |
| 122 | * avoids blocking the caller and trading per-connection data between | 145 | * avoids blocking the caller and trading per-connection data between |
| 123 | * caches per message. | 146 | * caches per message. |
| 124 | * | ||
| 125 | * The sem holder will issue a retry if they notice that someone queued | ||
| 126 | * a message after they stopped walking the send queue but before they | ||
| 127 | * dropped the sem. | ||
| 128 | */ | 147 | */ |
| 129 | if (!mutex_trylock(&conn->c_send_lock)) { | 148 | if (!acquire_in_xmit(conn)) { |
| 130 | rds_stats_inc(s_send_sem_contention); | 149 | rds_stats_inc(s_send_lock_contention); |
| 131 | ret = -ENOMEM; | 150 | ret = -ENOMEM; |
| 132 | goto out; | 151 | goto out; |
| 133 | } | 152 | } |
| 134 | 153 | ||
| 154 | /* | ||
| 155 | * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, | ||
| 156 | * we do the opposite to avoid races. | ||
| 157 | */ | ||
| 158 | if (!rds_conn_up(conn)) { | ||
| 159 | release_in_xmit(conn); | ||
| 160 | ret = 0; | ||
| 161 | goto out; | ||
| 162 | } | ||
| 163 | |||
| 135 | if (conn->c_trans->xmit_prepare) | 164 | if (conn->c_trans->xmit_prepare) |
| 136 | conn->c_trans->xmit_prepare(conn); | 165 | conn->c_trans->xmit_prepare(conn); |
| 137 | 166 | ||
| 138 | /* | 167 | /* |
| 139 | * spin trying to push headers and data down the connection until | 168 | * spin trying to push headers and data down the connection until |
| 140 | * the connection doens't make forward progress. | 169 | * the connection doesn't make forward progress. |
| 141 | */ | 170 | */ |
| 142 | while (--send_quota) { | 171 | while (1) { |
| 143 | /* | ||
| 144 | * See if need to send a congestion map update if we're | ||
| 145 | * between sending messages. The send_sem protects our sole | ||
| 146 | * use of c_map_offset and _bytes. | ||
| 147 | * Note this is used only by transports that define a special | ||
| 148 | * xmit_cong_map function. For all others, we create allocate | ||
| 149 | * a cong_map message and treat it just like any other send. | ||
| 150 | */ | ||
| 151 | if (conn->c_map_bytes) { | ||
| 152 | ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong, | ||
| 153 | conn->c_map_offset); | ||
| 154 | if (ret <= 0) | ||
| 155 | break; | ||
| 156 | 172 | ||
| 157 | conn->c_map_offset += ret; | ||
| 158 | conn->c_map_bytes -= ret; | ||
| 159 | if (conn->c_map_bytes) | ||
| 160 | continue; | ||
| 161 | } | ||
| 162 | |||
| 163 | /* If we're done sending the current message, clear the | ||
| 164 | * offset and S/G temporaries. | ||
| 165 | */ | ||
| 166 | rm = conn->c_xmit_rm; | 173 | rm = conn->c_xmit_rm; |
| 167 | if (rm != NULL && | ||
| 168 | conn->c_xmit_hdr_off == sizeof(struct rds_header) && | ||
| 169 | conn->c_xmit_sg == rm->m_nents) { | ||
| 170 | conn->c_xmit_rm = NULL; | ||
| 171 | conn->c_xmit_sg = 0; | ||
| 172 | conn->c_xmit_hdr_off = 0; | ||
| 173 | conn->c_xmit_data_off = 0; | ||
| 174 | conn->c_xmit_rdma_sent = 0; | ||
| 175 | 174 | ||
| 176 | /* Release the reference to the previous message. */ | 175 | /* |
| 177 | rds_message_put(rm); | 176 | * If between sending messages, we can send a pending congestion |
| 178 | rm = NULL; | 177 | * map update. |
| 179 | } | ||
| 180 | |||
| 181 | /* If we're asked to send a cong map update, do so. | ||
| 182 | */ | 178 | */ |
| 183 | if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) { | 179 | if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { |
| 184 | if (conn->c_trans->xmit_cong_map != NULL) { | ||
| 185 | conn->c_map_offset = 0; | ||
| 186 | conn->c_map_bytes = sizeof(struct rds_header) + | ||
| 187 | RDS_CONG_MAP_BYTES; | ||
| 188 | continue; | ||
| 189 | } | ||
| 190 | |||
| 191 | rm = rds_cong_update_alloc(conn); | 180 | rm = rds_cong_update_alloc(conn); |
| 192 | if (IS_ERR(rm)) { | 181 | if (IS_ERR(rm)) { |
| 193 | ret = PTR_ERR(rm); | 182 | ret = PTR_ERR(rm); |
| 194 | break; | 183 | break; |
| 195 | } | 184 | } |
| 185 | rm->data.op_active = 1; | ||
| 196 | 186 | ||
| 197 | conn->c_xmit_rm = rm; | 187 | conn->c_xmit_rm = rm; |
| 198 | } | 188 | } |
| 199 | 189 | ||
| 200 | /* | 190 | /* |
| 201 | * Grab the next message from the send queue, if there is one. | 191 | * If not already working on one, grab the next message. |
| 202 | * | 192 | * |
| 203 | * c_xmit_rm holds a ref while we're sending this message down | 193 | * c_xmit_rm holds a ref while we're sending this message down |
| 204 | * the connction. We can use this ref while holding the | 194 | * the connction. We can use this ref while holding the |
| 205 | * send_sem.. rds_send_reset() is serialized with it. | 195 | * send_sem.. rds_send_reset() is serialized with it. |
| 206 | */ | 196 | */ |
| 207 | if (rm == NULL) { | 197 | if (!rm) { |
| 208 | unsigned int len; | 198 | unsigned int len; |
| 209 | 199 | ||
| 210 | spin_lock_irqsave(&conn->c_lock, flags); | 200 | spin_lock_irqsave(&conn->c_lock, flags); |
| @@ -224,10 +214,8 @@ int rds_send_xmit(struct rds_connection *conn) | |||
| 224 | 214 | ||
| 225 | spin_unlock_irqrestore(&conn->c_lock, flags); | 215 | spin_unlock_irqrestore(&conn->c_lock, flags); |
| 226 | 216 | ||
| 227 | if (rm == NULL) { | 217 | if (!rm) |
| 228 | was_empty = 1; | ||
| 229 | break; | 218 | break; |
| 230 | } | ||
| 231 | 219 | ||
| 232 | /* Unfortunately, the way Infiniband deals with | 220 | /* Unfortunately, the way Infiniband deals with |
| 233 | * RDMA to a bad MR key is by moving the entire | 221 | * RDMA to a bad MR key is by moving the entire |
| @@ -236,13 +224,12 @@ int rds_send_xmit(struct rds_connection *conn) | |||
| 236 | * connection. | 224 | * connection. |
| 237 | * Therefore, we never retransmit messages with RDMA ops. | 225 | * Therefore, we never retransmit messages with RDMA ops. |
| 238 | */ | 226 | */ |
| 239 | if (rm->m_rdma_op && | 227 | if (rm->rdma.op_active && |
| 240 | test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { | 228 | test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { |
| 241 | spin_lock_irqsave(&conn->c_lock, flags); | 229 | spin_lock_irqsave(&conn->c_lock, flags); |
| 242 | if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) | 230 | if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) |
| 243 | list_move(&rm->m_conn_item, &to_be_dropped); | 231 | list_move(&rm->m_conn_item, &to_be_dropped); |
| 244 | spin_unlock_irqrestore(&conn->c_lock, flags); | 232 | spin_unlock_irqrestore(&conn->c_lock, flags); |
| 245 | rds_message_put(rm); | ||
| 246 | continue; | 233 | continue; |
| 247 | } | 234 | } |
| 248 | 235 | ||
| @@ -263,23 +250,55 @@ int rds_send_xmit(struct rds_connection *conn) | |||
| 263 | conn->c_xmit_rm = rm; | 250 | conn->c_xmit_rm = rm; |
| 264 | } | 251 | } |
| 265 | 252 | ||
| 266 | /* | 253 | /* The transport either sends the whole rdma or none of it */ |
| 267 | * Try and send an rdma message. Let's see if we can | 254 | if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { |
| 268 | * keep this simple and require that the transport either | 255 | rm->m_final_op = &rm->rdma; |
| 269 | * send the whole rdma or none of it. | 256 | ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); |
| 270 | */ | ||
| 271 | if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) { | ||
| 272 | ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op); | ||
| 273 | if (ret) | 257 | if (ret) |
| 274 | break; | 258 | break; |
| 275 | conn->c_xmit_rdma_sent = 1; | 259 | conn->c_xmit_rdma_sent = 1; |
| 260 | |||
| 276 | /* The transport owns the mapped memory for now. | 261 | /* The transport owns the mapped memory for now. |
| 277 | * You can't unmap it while it's on the send queue */ | 262 | * You can't unmap it while it's on the send queue */ |
| 278 | set_bit(RDS_MSG_MAPPED, &rm->m_flags); | 263 | set_bit(RDS_MSG_MAPPED, &rm->m_flags); |
| 279 | } | 264 | } |
| 280 | 265 | ||
| 281 | if (conn->c_xmit_hdr_off < sizeof(struct rds_header) || | 266 | if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { |
| 282 | conn->c_xmit_sg < rm->m_nents) { | 267 | rm->m_final_op = &rm->atomic; |
| 268 | ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); | ||
| 269 | if (ret) | ||
| 270 | break; | ||
| 271 | conn->c_xmit_atomic_sent = 1; | ||
| 272 | |||
| 273 | /* The transport owns the mapped memory for now. | ||
| 274 | * You can't unmap it while it's on the send queue */ | ||
| 275 | set_bit(RDS_MSG_MAPPED, &rm->m_flags); | ||
| 276 | } | ||
| 277 | |||
| 278 | /* | ||
| 279 | * A number of cases require an RDS header to be sent | ||
| 280 | * even if there is no data. | ||
| 281 | * We permit 0-byte sends; rds-ping depends on this. | ||
| 282 | * However, if there are exclusively attached silent ops, | ||
| 283 | * we skip the hdr/data send, to enable silent operation. | ||
| 284 | */ | ||
| 285 | if (rm->data.op_nents == 0) { | ||
| 286 | int ops_present; | ||
| 287 | int all_ops_are_silent = 1; | ||
| 288 | |||
| 289 | ops_present = (rm->atomic.op_active || rm->rdma.op_active); | ||
| 290 | if (rm->atomic.op_active && !rm->atomic.op_silent) | ||
| 291 | all_ops_are_silent = 0; | ||
| 292 | if (rm->rdma.op_active && !rm->rdma.op_silent) | ||
| 293 | all_ops_are_silent = 0; | ||
| 294 | |||
| 295 | if (ops_present && all_ops_are_silent | ||
| 296 | && !rm->m_rdma_cookie) | ||
| 297 | rm->data.op_active = 0; | ||
| 298 | } | ||
| 299 | |||
| 300 | if (rm->data.op_active && !conn->c_xmit_data_sent) { | ||
| 301 | rm->m_final_op = &rm->data; | ||
| 283 | ret = conn->c_trans->xmit(conn, rm, | 302 | ret = conn->c_trans->xmit(conn, rm, |
| 284 | conn->c_xmit_hdr_off, | 303 | conn->c_xmit_hdr_off, |
| 285 | conn->c_xmit_sg, | 304 | conn->c_xmit_sg, |
| @@ -295,7 +314,7 @@ int rds_send_xmit(struct rds_connection *conn) | |||
| 295 | ret -= tmp; | 314 | ret -= tmp; |
| 296 | } | 315 | } |
| 297 | 316 | ||
| 298 | sg = &rm->m_sg[conn->c_xmit_sg]; | 317 | sg = &rm->data.op_sg[conn->c_xmit_sg]; |
| 299 | while (ret) { | 318 | while (ret) { |
| 300 | tmp = min_t(int, ret, sg->length - | 319 | tmp = min_t(int, ret, sg->length - |
| 301 | conn->c_xmit_data_off); | 320 | conn->c_xmit_data_off); |
| @@ -306,49 +325,63 @@ int rds_send_xmit(struct rds_connection *conn) | |||
| 306 | sg++; | 325 | sg++; |
| 307 | conn->c_xmit_sg++; | 326 | conn->c_xmit_sg++; |
| 308 | BUG_ON(ret != 0 && | 327 | BUG_ON(ret != 0 && |
| 309 | conn->c_xmit_sg == rm->m_nents); | 328 | conn->c_xmit_sg == rm->data.op_nents); |
| 310 | } | 329 | } |
| 311 | } | 330 | } |
| 331 | |||
| 332 | if (conn->c_xmit_hdr_off == sizeof(struct rds_header) && | ||
| 333 | (conn->c_xmit_sg == rm->data.op_nents)) | ||
| 334 | conn->c_xmit_data_sent = 1; | ||
| 312 | } | 335 | } |
| 313 | } | ||
| 314 | 336 | ||
| 315 | /* Nuke any messages we decided not to retransmit. */ | 337 | /* |
| 316 | if (!list_empty(&to_be_dropped)) | 338 | * A rm will only take multiple times through this loop |
| 317 | rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); | 339 | * if there is a data op. Thus, if the data is sent (or there was |
| 340 | * none), then we're done with the rm. | ||
| 341 | */ | ||
| 342 | if (!rm->data.op_active || conn->c_xmit_data_sent) { | ||
| 343 | conn->c_xmit_rm = NULL; | ||
| 344 | conn->c_xmit_sg = 0; | ||
| 345 | conn->c_xmit_hdr_off = 0; | ||
| 346 | conn->c_xmit_data_off = 0; | ||
| 347 | conn->c_xmit_rdma_sent = 0; | ||
| 348 | conn->c_xmit_atomic_sent = 0; | ||
| 349 | conn->c_xmit_data_sent = 0; | ||
| 350 | |||
| 351 | rds_message_put(rm); | ||
| 352 | } | ||
| 353 | } | ||
| 318 | 354 | ||
| 319 | if (conn->c_trans->xmit_complete) | 355 | if (conn->c_trans->xmit_complete) |
| 320 | conn->c_trans->xmit_complete(conn); | 356 | conn->c_trans->xmit_complete(conn); |
| 321 | 357 | ||
| 322 | /* | 358 | release_in_xmit(conn); |
| 323 | * We might be racing with another sender who queued a message but | ||
| 324 | * backed off on noticing that we held the c_send_lock. If we check | ||
| 325 | * for queued messages after dropping the sem then either we'll | ||
| 326 | * see the queued message or the queuer will get the sem. If we | ||
| 327 | * notice the queued message then we trigger an immediate retry. | ||
| 328 | * | ||
| 329 | * We need to be careful only to do this when we stopped processing | ||
| 330 | * the send queue because it was empty. It's the only way we | ||
| 331 | * stop processing the loop when the transport hasn't taken | ||
| 332 | * responsibility for forward progress. | ||
| 333 | */ | ||
| 334 | mutex_unlock(&conn->c_send_lock); | ||
| 335 | 359 | ||
| 336 | if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) { | 360 | /* Nuke any messages we decided not to retransmit. */ |
| 337 | /* We exhausted the send quota, but there's work left to | 361 | if (!list_empty(&to_be_dropped)) { |
| 338 | * do. Return and (re-)schedule the send worker. | 362 | /* irqs on here, so we can put(), unlike above */ |
| 339 | */ | 363 | list_for_each_entry(rm, &to_be_dropped, m_conn_item) |
| 340 | ret = -EAGAIN; | 364 | rds_message_put(rm); |
| 365 | rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); | ||
| 341 | } | 366 | } |
| 342 | 367 | ||
| 343 | if (ret == 0 && was_empty) { | 368 | /* |
| 344 | /* A simple bit test would be way faster than taking the | 369 | * Other senders can queue a message after we last test the send queue |
| 345 | * spin lock */ | 370 | * but before we clear RDS_IN_XMIT. In that case they'd back off and |
| 346 | spin_lock_irqsave(&conn->c_lock, flags); | 371 | * not try and send their newly queued message. We need to check the |
| 372 | * send queue after having cleared RDS_IN_XMIT so that their message | ||
| 373 | * doesn't get stuck on the send queue. | ||
| 374 | * | ||
| 375 | * If the transport cannot continue (i.e ret != 0), then it must | ||
| 376 | * call us when more room is available, such as from the tx | ||
| 377 | * completion handler. | ||
| 378 | */ | ||
| 379 | if (ret == 0) { | ||
| 380 | smp_mb(); | ||
| 347 | if (!list_empty(&conn->c_send_queue)) { | 381 | if (!list_empty(&conn->c_send_queue)) { |
| 348 | rds_stats_inc(s_send_sem_queue_raced); | 382 | rds_stats_inc(s_send_lock_queue_raced); |
| 349 | ret = -EAGAIN; | 383 | goto restart; |
| 350 | } | 384 | } |
| 351 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
| 352 | } | 385 | } |
| 353 | out: | 386 | out: |
| 354 | return ret; | 387 | return ret; |
| @@ -376,52 +409,60 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, | |||
| 376 | } | 409 | } |
| 377 | 410 | ||
| 378 | /* | 411 | /* |
| 379 | * Returns true if there are no messages on the send and retransmit queues | 412 | * This is pretty similar to what happens below in the ACK |
| 380 | * which have a sequence number greater than or equal to the given sequence | 413 | * handling code - except that we call here as soon as we get |
| 381 | * number. | 414 | * the IB send completion on the RDMA op and the accompanying |
| 415 | * message. | ||
| 382 | */ | 416 | */ |
| 383 | int rds_send_acked_before(struct rds_connection *conn, u64 seq) | 417 | void rds_rdma_send_complete(struct rds_message *rm, int status) |
| 384 | { | 418 | { |
| 385 | struct rds_message *rm, *tmp; | 419 | struct rds_sock *rs = NULL; |
| 386 | int ret = 1; | 420 | struct rm_rdma_op *ro; |
| 421 | struct rds_notifier *notifier; | ||
| 422 | unsigned long flags; | ||
| 387 | 423 | ||
| 388 | spin_lock(&conn->c_lock); | 424 | spin_lock_irqsave(&rm->m_rs_lock, flags); |
| 389 | 425 | ||
| 390 | list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { | 426 | ro = &rm->rdma; |
| 391 | if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) | 427 | if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && |
| 392 | ret = 0; | 428 | ro->op_active && ro->op_notify && ro->op_notifier) { |
| 393 | break; | 429 | notifier = ro->op_notifier; |
| 394 | } | 430 | rs = rm->m_rs; |
| 431 | sock_hold(rds_rs_to_sk(rs)); | ||
| 395 | 432 | ||
| 396 | list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { | 433 | notifier->n_status = status; |
| 397 | if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) | 434 | spin_lock(&rs->rs_lock); |
| 398 | ret = 0; | 435 | list_add_tail(&notifier->n_list, &rs->rs_notify_queue); |
| 399 | break; | 436 | spin_unlock(&rs->rs_lock); |
| 437 | |||
| 438 | ro->op_notifier = NULL; | ||
| 400 | } | 439 | } |
| 401 | 440 | ||
| 402 | spin_unlock(&conn->c_lock); | 441 | spin_unlock_irqrestore(&rm->m_rs_lock, flags); |
| 403 | 442 | ||
| 404 | return ret; | 443 | if (rs) { |
| 444 | rds_wake_sk_sleep(rs); | ||
| 445 | sock_put(rds_rs_to_sk(rs)); | ||
| 446 | } | ||
| 405 | } | 447 | } |
| 448 | EXPORT_SYMBOL_GPL(rds_rdma_send_complete); | ||
| 406 | 449 | ||
| 407 | /* | 450 | /* |
| 408 | * This is pretty similar to what happens below in the ACK | 451 | * Just like above, except looks at atomic op |
| 409 | * handling code - except that we call here as soon as we get | ||
| 410 | * the IB send completion on the RDMA op and the accompanying | ||
| 411 | * message. | ||
| 412 | */ | 452 | */ |
| 413 | void rds_rdma_send_complete(struct rds_message *rm, int status) | 453 | void rds_atomic_send_complete(struct rds_message *rm, int status) |
| 414 | { | 454 | { |
| 415 | struct rds_sock *rs = NULL; | 455 | struct rds_sock *rs = NULL; |
| 416 | struct rds_rdma_op *ro; | 456 | struct rm_atomic_op *ao; |
| 417 | struct rds_notifier *notifier; | 457 | struct rds_notifier *notifier; |
| 458 | unsigned long flags; | ||
| 418 | 459 | ||
| 419 | spin_lock(&rm->m_rs_lock); | 460 | spin_lock_irqsave(&rm->m_rs_lock, flags); |
| 420 | 461 | ||
| 421 | ro = rm->m_rdma_op; | 462 | ao = &rm->atomic; |
| 422 | if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && | 463 | if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) |
| 423 | ro && ro->r_notify && ro->r_notifier) { | 464 | && ao->op_active && ao->op_notify && ao->op_notifier) { |
| 424 | notifier = ro->r_notifier; | 465 | notifier = ao->op_notifier; |
| 425 | rs = rm->m_rs; | 466 | rs = rm->m_rs; |
| 426 | sock_hold(rds_rs_to_sk(rs)); | 467 | sock_hold(rds_rs_to_sk(rs)); |
| 427 | 468 | ||
| @@ -430,17 +471,17 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) | |||
| 430 | list_add_tail(&notifier->n_list, &rs->rs_notify_queue); | 471 | list_add_tail(&notifier->n_list, &rs->rs_notify_queue); |
| 431 | spin_unlock(&rs->rs_lock); | 472 | spin_unlock(&rs->rs_lock); |
| 432 | 473 | ||
| 433 | ro->r_notifier = NULL; | 474 | ao->op_notifier = NULL; |
| 434 | } | 475 | } |
| 435 | 476 | ||
| 436 | spin_unlock(&rm->m_rs_lock); | 477 | spin_unlock_irqrestore(&rm->m_rs_lock, flags); |
| 437 | 478 | ||
| 438 | if (rs) { | 479 | if (rs) { |
| 439 | rds_wake_sk_sleep(rs); | 480 | rds_wake_sk_sleep(rs); |
| 440 | sock_put(rds_rs_to_sk(rs)); | 481 | sock_put(rds_rs_to_sk(rs)); |
| 441 | } | 482 | } |
| 442 | } | 483 | } |
| 443 | EXPORT_SYMBOL_GPL(rds_rdma_send_complete); | 484 | EXPORT_SYMBOL_GPL(rds_atomic_send_complete); |
| 444 | 485 | ||
| 445 | /* | 486 | /* |
| 446 | * This is the same as rds_rdma_send_complete except we | 487 | * This is the same as rds_rdma_send_complete except we |
| @@ -448,15 +489,23 @@ EXPORT_SYMBOL_GPL(rds_rdma_send_complete); | |||
| 448 | * socket, socket lock) and can just move the notifier. | 489 | * socket, socket lock) and can just move the notifier. |
| 449 | */ | 490 | */ |
| 450 | static inline void | 491 | static inline void |
| 451 | __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) | 492 | __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) |
| 452 | { | 493 | { |
| 453 | struct rds_rdma_op *ro; | 494 | struct rm_rdma_op *ro; |
| 495 | struct rm_atomic_op *ao; | ||
| 496 | |||
| 497 | ro = &rm->rdma; | ||
| 498 | if (ro->op_active && ro->op_notify && ro->op_notifier) { | ||
| 499 | ro->op_notifier->n_status = status; | ||
| 500 | list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue); | ||
| 501 | ro->op_notifier = NULL; | ||
| 502 | } | ||
| 454 | 503 | ||
| 455 | ro = rm->m_rdma_op; | 504 | ao = &rm->atomic; |
| 456 | if (ro && ro->r_notify && ro->r_notifier) { | 505 | if (ao->op_active && ao->op_notify && ao->op_notifier) { |
| 457 | ro->r_notifier->n_status = status; | 506 | ao->op_notifier->n_status = status; |
| 458 | list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue); | 507 | list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue); |
| 459 | ro->r_notifier = NULL; | 508 | ao->op_notifier = NULL; |
| 460 | } | 509 | } |
| 461 | 510 | ||
| 462 | /* No need to wake the app - caller does this */ | 511 | /* No need to wake the app - caller does this */ |
| @@ -468,7 +517,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status | |||
| 468 | * So speed is not an issue here. | 517 | * So speed is not an issue here. |
| 469 | */ | 518 | */ |
| 470 | struct rds_message *rds_send_get_message(struct rds_connection *conn, | 519 | struct rds_message *rds_send_get_message(struct rds_connection *conn, |
| 471 | struct rds_rdma_op *op) | 520 | struct rm_rdma_op *op) |
| 472 | { | 521 | { |
| 473 | struct rds_message *rm, *tmp, *found = NULL; | 522 | struct rds_message *rm, *tmp, *found = NULL; |
| 474 | unsigned long flags; | 523 | unsigned long flags; |
| @@ -476,7 +525,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, | |||
| 476 | spin_lock_irqsave(&conn->c_lock, flags); | 525 | spin_lock_irqsave(&conn->c_lock, flags); |
| 477 | 526 | ||
| 478 | list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { | 527 | list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { |
| 479 | if (rm->m_rdma_op == op) { | 528 | if (&rm->rdma == op) { |
| 480 | atomic_inc(&rm->m_refcount); | 529 | atomic_inc(&rm->m_refcount); |
| 481 | found = rm; | 530 | found = rm; |
| 482 | goto out; | 531 | goto out; |
| @@ -484,7 +533,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, | |||
| 484 | } | 533 | } |
| 485 | 534 | ||
| 486 | list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { | 535 | list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { |
| 487 | if (rm->m_rdma_op == op) { | 536 | if (&rm->rdma == op) { |
| 488 | atomic_inc(&rm->m_refcount); | 537 | atomic_inc(&rm->m_refcount); |
| 489 | found = rm; | 538 | found = rm; |
| 490 | break; | 539 | break; |
| @@ -544,19 +593,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status) | |||
| 544 | spin_lock(&rs->rs_lock); | 593 | spin_lock(&rs->rs_lock); |
| 545 | 594 | ||
| 546 | if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { | 595 | if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { |
| 547 | struct rds_rdma_op *ro = rm->m_rdma_op; | 596 | struct rm_rdma_op *ro = &rm->rdma; |
| 548 | struct rds_notifier *notifier; | 597 | struct rds_notifier *notifier; |
| 549 | 598 | ||
| 550 | list_del_init(&rm->m_sock_item); | 599 | list_del_init(&rm->m_sock_item); |
| 551 | rds_send_sndbuf_remove(rs, rm); | 600 | rds_send_sndbuf_remove(rs, rm); |
| 552 | 601 | ||
| 553 | if (ro && ro->r_notifier && (status || ro->r_notify)) { | 602 | if (ro->op_active && ro->op_notifier && |
| 554 | notifier = ro->r_notifier; | 603 | (ro->op_notify || (ro->op_recverr && status))) { |
| 604 | notifier = ro->op_notifier; | ||
| 555 | list_add_tail(&notifier->n_list, | 605 | list_add_tail(&notifier->n_list, |
| 556 | &rs->rs_notify_queue); | 606 | &rs->rs_notify_queue); |
| 557 | if (!notifier->n_status) | 607 | if (!notifier->n_status) |
| 558 | notifier->n_status = status; | 608 | notifier->n_status = status; |
| 559 | rm->m_rdma_op->r_notifier = NULL; | 609 | rm->rdma.op_notifier = NULL; |
| 560 | } | 610 | } |
| 561 | was_on_sock = 1; | 611 | was_on_sock = 1; |
| 562 | rm->m_rs = NULL; | 612 | rm->m_rs = NULL; |
| @@ -619,9 +669,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) | |||
| 619 | { | 669 | { |
| 620 | struct rds_message *rm, *tmp; | 670 | struct rds_message *rm, *tmp; |
| 621 | struct rds_connection *conn; | 671 | struct rds_connection *conn; |
| 622 | unsigned long flags, flags2; | 672 | unsigned long flags; |
| 623 | LIST_HEAD(list); | 673 | LIST_HEAD(list); |
| 624 | int wake = 0; | ||
| 625 | 674 | ||
| 626 | /* get all the messages we're dropping under the rs lock */ | 675 | /* get all the messages we're dropping under the rs lock */ |
| 627 | spin_lock_irqsave(&rs->rs_lock, flags); | 676 | spin_lock_irqsave(&rs->rs_lock, flags); |
| @@ -631,59 +680,54 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) | |||
| 631 | dest->sin_port != rm->m_inc.i_hdr.h_dport)) | 680 | dest->sin_port != rm->m_inc.i_hdr.h_dport)) |
| 632 | continue; | 681 | continue; |
| 633 | 682 | ||
| 634 | wake = 1; | ||
| 635 | list_move(&rm->m_sock_item, &list); | 683 | list_move(&rm->m_sock_item, &list); |
| 636 | rds_send_sndbuf_remove(rs, rm); | 684 | rds_send_sndbuf_remove(rs, rm); |
| 637 | clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); | 685 | clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); |
| 638 | } | 686 | } |
| 639 | 687 | ||
| 640 | /* order flag updates with the rs lock */ | 688 | /* order flag updates with the rs lock */ |
| 641 | if (wake) | 689 | smp_mb__after_clear_bit(); |
| 642 | smp_mb__after_clear_bit(); | ||
| 643 | 690 | ||
| 644 | spin_unlock_irqrestore(&rs->rs_lock, flags); | 691 | spin_unlock_irqrestore(&rs->rs_lock, flags); |
| 645 | 692 | ||
| 646 | conn = NULL; | 693 | if (list_empty(&list)) |
| 694 | return; | ||
| 647 | 695 | ||
| 648 | /* now remove the messages from the conn list as needed */ | 696 | /* Remove the messages from the conn */ |
| 649 | list_for_each_entry(rm, &list, m_sock_item) { | 697 | list_for_each_entry(rm, &list, m_sock_item) { |
| 650 | /* We do this here rather than in the loop above, so that | ||
| 651 | * we don't have to nest m_rs_lock under rs->rs_lock */ | ||
| 652 | spin_lock_irqsave(&rm->m_rs_lock, flags2); | ||
| 653 | /* If this is a RDMA operation, notify the app. */ | ||
| 654 | spin_lock(&rs->rs_lock); | ||
| 655 | __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED); | ||
| 656 | spin_unlock(&rs->rs_lock); | ||
| 657 | rm->m_rs = NULL; | ||
| 658 | spin_unlock_irqrestore(&rm->m_rs_lock, flags2); | ||
| 659 | 698 | ||
| 699 | conn = rm->m_inc.i_conn; | ||
| 700 | |||
| 701 | spin_lock_irqsave(&conn->c_lock, flags); | ||
| 660 | /* | 702 | /* |
| 661 | * If we see this flag cleared then we're *sure* that someone | 703 | * Maybe someone else beat us to removing rm from the conn. |
| 662 | * else beat us to removing it from the conn. If we race | 704 | * If we race with their flag update we'll get the lock and |
| 663 | * with their flag update we'll get the lock and then really | 705 | * then really see that the flag has been cleared. |
| 664 | * see that the flag has been cleared. | ||
| 665 | */ | 706 | */ |
| 666 | if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags)) | 707 | if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { |
| 708 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
| 667 | continue; | 709 | continue; |
| 668 | |||
| 669 | if (conn != rm->m_inc.i_conn) { | ||
| 670 | if (conn) | ||
| 671 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
| 672 | conn = rm->m_inc.i_conn; | ||
| 673 | spin_lock_irqsave(&conn->c_lock, flags); | ||
| 674 | } | 710 | } |
| 711 | list_del_init(&rm->m_conn_item); | ||
| 712 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
| 675 | 713 | ||
| 676 | if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { | 714 | /* |
| 677 | list_del_init(&rm->m_conn_item); | 715 | * Couldn't grab m_rs_lock in top loop (lock ordering), |
| 678 | rds_message_put(rm); | 716 | * but we can now. |
| 679 | } | 717 | */ |
| 680 | } | 718 | spin_lock_irqsave(&rm->m_rs_lock, flags); |
| 681 | 719 | ||
| 682 | if (conn) | 720 | spin_lock(&rs->rs_lock); |
| 683 | spin_unlock_irqrestore(&conn->c_lock, flags); | 721 | __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); |
| 722 | spin_unlock(&rs->rs_lock); | ||
| 684 | 723 | ||
| 685 | if (wake) | 724 | rm->m_rs = NULL; |
| 686 | rds_wake_sk_sleep(rs); | 725 | spin_unlock_irqrestore(&rm->m_rs_lock, flags); |
| 726 | |||
| 727 | rds_message_put(rm); | ||
| 728 | } | ||
| 729 | |||
| 730 | rds_wake_sk_sleep(rs); | ||
| 687 | 731 | ||
| 688 | while (!list_empty(&list)) { | 732 | while (!list_empty(&list)) { |
| 689 | rm = list_entry(list.next, struct rds_message, m_sock_item); | 733 | rm = list_entry(list.next, struct rds_message, m_sock_item); |
| @@ -763,6 +807,63 @@ out: | |||
| 763 | return *queued; | 807 | return *queued; |
| 764 | } | 808 | } |
| 765 | 809 | ||
| 810 | /* | ||
| 811 | * rds_message is getting to be quite complicated, and we'd like to allocate | ||
| 812 | * it all in one go. This figures out how big it needs to be up front. | ||
| 813 | */ | ||
| 814 | static int rds_rm_size(struct msghdr *msg, int data_len) | ||
| 815 | { | ||
| 816 | struct cmsghdr *cmsg; | ||
| 817 | int size = 0; | ||
| 818 | int cmsg_groups = 0; | ||
| 819 | int retval; | ||
| 820 | |||
| 821 | for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { | ||
| 822 | if (!CMSG_OK(msg, cmsg)) | ||
| 823 | return -EINVAL; | ||
| 824 | |||
| 825 | if (cmsg->cmsg_level != SOL_RDS) | ||
| 826 | continue; | ||
| 827 | |||
| 828 | switch (cmsg->cmsg_type) { | ||
| 829 | case RDS_CMSG_RDMA_ARGS: | ||
| 830 | cmsg_groups |= 1; | ||
| 831 | retval = rds_rdma_extra_size(CMSG_DATA(cmsg)); | ||
| 832 | if (retval < 0) | ||
| 833 | return retval; | ||
| 834 | size += retval; | ||
| 835 | |||
| 836 | break; | ||
| 837 | |||
| 838 | case RDS_CMSG_RDMA_DEST: | ||
| 839 | case RDS_CMSG_RDMA_MAP: | ||
| 840 | cmsg_groups |= 2; | ||
| 841 | /* these are valid but do no add any size */ | ||
| 842 | break; | ||
| 843 | |||
| 844 | case RDS_CMSG_ATOMIC_CSWP: | ||
| 845 | case RDS_CMSG_ATOMIC_FADD: | ||
| 846 | case RDS_CMSG_MASKED_ATOMIC_CSWP: | ||
| 847 | case RDS_CMSG_MASKED_ATOMIC_FADD: | ||
| 848 | cmsg_groups |= 1; | ||
| 849 | size += sizeof(struct scatterlist); | ||
| 850 | break; | ||
| 851 | |||
| 852 | default: | ||
| 853 | return -EINVAL; | ||
| 854 | } | ||
| 855 | |||
| 856 | } | ||
| 857 | |||
| 858 | size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); | ||
| 859 | |||
| 860 | /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ | ||
| 861 | if (cmsg_groups == 3) | ||
| 862 | return -EINVAL; | ||
| 863 | |||
| 864 | return size; | ||
| 865 | } | ||
| 866 | |||
| 766 | static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, | 867 | static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, |
| 767 | struct msghdr *msg, int *allocated_mr) | 868 | struct msghdr *msg, int *allocated_mr) |
| 768 | { | 869 | { |
| @@ -777,7 +878,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, | |||
| 777 | continue; | 878 | continue; |
| 778 | 879 | ||
| 779 | /* As a side effect, RDMA_DEST and RDMA_MAP will set | 880 | /* As a side effect, RDMA_DEST and RDMA_MAP will set |
| 780 | * rm->m_rdma_cookie and rm->m_rdma_mr. | 881 | * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr. |
| 781 | */ | 882 | */ |
| 782 | switch (cmsg->cmsg_type) { | 883 | switch (cmsg->cmsg_type) { |
| 783 | case RDS_CMSG_RDMA_ARGS: | 884 | case RDS_CMSG_RDMA_ARGS: |
| @@ -793,6 +894,12 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, | |||
| 793 | if (!ret) | 894 | if (!ret) |
| 794 | *allocated_mr = 1; | 895 | *allocated_mr = 1; |
| 795 | break; | 896 | break; |
| 897 | case RDS_CMSG_ATOMIC_CSWP: | ||
| 898 | case RDS_CMSG_ATOMIC_FADD: | ||
| 899 | case RDS_CMSG_MASKED_ATOMIC_CSWP: | ||
| 900 | case RDS_CMSG_MASKED_ATOMIC_FADD: | ||
| 901 | ret = rds_cmsg_atomic(rs, rm, cmsg); | ||
| 902 | break; | ||
| 796 | 903 | ||
| 797 | default: | 904 | default: |
| 798 | return -EINVAL; | 905 | return -EINVAL; |
| @@ -850,13 +957,26 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, | |||
| 850 | goto out; | 957 | goto out; |
| 851 | } | 958 | } |
| 852 | 959 | ||
| 853 | rm = rds_message_copy_from_user(msg->msg_iov, payload_len); | 960 | /* size of rm including all sgs */ |
| 854 | if (IS_ERR(rm)) { | 961 | ret = rds_rm_size(msg, payload_len); |
| 855 | ret = PTR_ERR(rm); | 962 | if (ret < 0) |
| 856 | rm = NULL; | 963 | goto out; |
| 964 | |||
| 965 | rm = rds_message_alloc(ret, GFP_KERNEL); | ||
| 966 | if (!rm) { | ||
| 967 | ret = -ENOMEM; | ||
| 857 | goto out; | 968 | goto out; |
| 858 | } | 969 | } |
| 859 | 970 | ||
| 971 | /* Attach data to the rm */ | ||
| 972 | if (payload_len) { | ||
| 973 | rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); | ||
| 974 | ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len); | ||
| 975 | if (ret) | ||
| 976 | goto out; | ||
| 977 | } | ||
| 978 | rm->data.op_active = 1; | ||
| 979 | |||
| 860 | rm->m_daddr = daddr; | 980 | rm->m_daddr = daddr; |
| 861 | 981 | ||
| 862 | /* rds_conn_create has a spinlock that runs with IRQ off. | 982 | /* rds_conn_create has a spinlock that runs with IRQ off. |
| @@ -879,22 +999,23 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, | |||
| 879 | if (ret) | 999 | if (ret) |
| 880 | goto out; | 1000 | goto out; |
| 881 | 1001 | ||
| 882 | if ((rm->m_rdma_cookie || rm->m_rdma_op) && | 1002 | if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { |
| 883 | conn->c_trans->xmit_rdma == NULL) { | ||
| 884 | if (printk_ratelimit()) | 1003 | if (printk_ratelimit()) |
| 885 | printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", | 1004 | printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", |
| 886 | rm->m_rdma_op, conn->c_trans->xmit_rdma); | 1005 | &rm->rdma, conn->c_trans->xmit_rdma); |
| 887 | ret = -EOPNOTSUPP; | 1006 | ret = -EOPNOTSUPP; |
| 888 | goto out; | 1007 | goto out; |
| 889 | } | 1008 | } |
| 890 | 1009 | ||
| 891 | /* If the connection is down, trigger a connect. We may | 1010 | if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) { |
| 892 | * have scheduled a delayed reconnect however - in this case | 1011 | if (printk_ratelimit()) |
| 893 | * we should not interfere. | 1012 | printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n", |
| 894 | */ | 1013 | &rm->atomic, conn->c_trans->xmit_atomic); |
| 895 | if (rds_conn_state(conn) == RDS_CONN_DOWN && | 1014 | ret = -EOPNOTSUPP; |
| 896 | !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) | 1015 | goto out; |
| 897 | queue_delayed_work(rds_wq, &conn->c_conn_w, 0); | 1016 | } |
| 1017 | |||
| 1018 | rds_conn_connect_if_down(conn); | ||
| 898 | 1019 | ||
| 899 | ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); | 1020 | ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); |
| 900 | if (ret) { | 1021 | if (ret) { |
| @@ -938,7 +1059,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, | |||
| 938 | rds_stats_inc(s_send_queued); | 1059 | rds_stats_inc(s_send_queued); |
| 939 | 1060 | ||
| 940 | if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) | 1061 | if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) |
| 941 | rds_send_worker(&conn->c_send_w.work); | 1062 | rds_send_xmit(conn); |
| 942 | 1063 | ||
| 943 | rds_message_put(rm); | 1064 | rds_message_put(rm); |
| 944 | return payload_len; | 1065 | return payload_len; |
| @@ -966,20 +1087,15 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) | |||
| 966 | int ret = 0; | 1087 | int ret = 0; |
| 967 | 1088 | ||
| 968 | rm = rds_message_alloc(0, GFP_ATOMIC); | 1089 | rm = rds_message_alloc(0, GFP_ATOMIC); |
| 969 | if (rm == NULL) { | 1090 | if (!rm) { |
| 970 | ret = -ENOMEM; | 1091 | ret = -ENOMEM; |
| 971 | goto out; | 1092 | goto out; |
| 972 | } | 1093 | } |
| 973 | 1094 | ||
| 974 | rm->m_daddr = conn->c_faddr; | 1095 | rm->m_daddr = conn->c_faddr; |
| 1096 | rm->data.op_active = 1; | ||
| 975 | 1097 | ||
| 976 | /* If the connection is down, trigger a connect. We may | 1098 | rds_conn_connect_if_down(conn); |
| 977 | * have scheduled a delayed reconnect however - in this case | ||
| 978 | * we should not interfere. | ||
| 979 | */ | ||
| 980 | if (rds_conn_state(conn) == RDS_CONN_DOWN && | ||
| 981 | !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) | ||
| 982 | queue_delayed_work(rds_wq, &conn->c_conn_w, 0); | ||
| 983 | 1099 | ||
| 984 | ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); | 1100 | ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); |
| 985 | if (ret) | 1101 | if (ret) |
| @@ -999,7 +1115,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) | |||
| 999 | rds_stats_inc(s_send_queued); | 1115 | rds_stats_inc(s_send_queued); |
| 1000 | rds_stats_inc(s_send_pong); | 1116 | rds_stats_inc(s_send_pong); |
| 1001 | 1117 | ||
| 1002 | queue_delayed_work(rds_wq, &conn->c_send_w, 0); | 1118 | if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) |
| 1119 | rds_send_xmit(conn); | ||
| 1120 | |||
| 1003 | rds_message_put(rm); | 1121 | rds_message_put(rm); |
| 1004 | return 0; | 1122 | return 0; |
| 1005 | 1123 | ||
diff --git a/net/rds/stats.c b/net/rds/stats.c index 7598eb07cfb1..10c759ccac0c 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c | |||
| @@ -57,8 +57,8 @@ static const char *const rds_stat_names[] = { | |||
| 57 | "recv_ping", | 57 | "recv_ping", |
| 58 | "send_queue_empty", | 58 | "send_queue_empty", |
| 59 | "send_queue_full", | 59 | "send_queue_full", |
| 60 | "send_sem_contention", | 60 | "send_lock_contention", |
| 61 | "send_sem_queue_raced", | 61 | "send_lock_queue_raced", |
| 62 | "send_immediate_retry", | 62 | "send_immediate_retry", |
| 63 | "send_delayed_retry", | 63 | "send_delayed_retry", |
| 64 | "send_drop_acked", | 64 | "send_drop_acked", |
| @@ -143,7 +143,7 @@ void rds_stats_exit(void) | |||
| 143 | rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); | 143 | rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); |
| 144 | } | 144 | } |
| 145 | 145 | ||
| 146 | int __init rds_stats_init(void) | 146 | int rds_stats_init(void) |
| 147 | { | 147 | { |
| 148 | rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); | 148 | rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); |
| 149 | return 0; | 149 | return 0; |
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c index 7829a20325d3..25ad0c77a26c 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c | |||
| @@ -105,13 +105,13 @@ void rds_sysctl_exit(void) | |||
| 105 | unregister_sysctl_table(rds_sysctl_reg_table); | 105 | unregister_sysctl_table(rds_sysctl_reg_table); |
| 106 | } | 106 | } |
| 107 | 107 | ||
| 108 | int __init rds_sysctl_init(void) | 108 | int rds_sysctl_init(void) |
| 109 | { | 109 | { |
| 110 | rds_sysctl_reconnect_min = msecs_to_jiffies(1); | 110 | rds_sysctl_reconnect_min = msecs_to_jiffies(1); |
| 111 | rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; | 111 | rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; |
| 112 | 112 | ||
| 113 | rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table); | 113 | rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table); |
| 114 | if (rds_sysctl_reg_table == NULL) | 114 | if (!rds_sysctl_reg_table) |
| 115 | return -ENOMEM; | 115 | return -ENOMEM; |
| 116 | return 0; | 116 | return 0; |
| 117 | } | 117 | } |
diff --git a/net/rds/tcp.c b/net/rds/tcp.c index babf4577ff7d..eeb08e6ab96b 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c | |||
| @@ -200,7 +200,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) | |||
| 200 | struct rds_tcp_connection *tc; | 200 | struct rds_tcp_connection *tc; |
| 201 | 201 | ||
| 202 | tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); | 202 | tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); |
| 203 | if (tc == NULL) | 203 | if (!tc) |
| 204 | return -ENOMEM; | 204 | return -ENOMEM; |
| 205 | 205 | ||
| 206 | tc->t_sock = NULL; | 206 | tc->t_sock = NULL; |
| @@ -258,7 +258,6 @@ struct rds_transport rds_tcp_transport = { | |||
| 258 | .laddr_check = rds_tcp_laddr_check, | 258 | .laddr_check = rds_tcp_laddr_check, |
| 259 | .xmit_prepare = rds_tcp_xmit_prepare, | 259 | .xmit_prepare = rds_tcp_xmit_prepare, |
| 260 | .xmit_complete = rds_tcp_xmit_complete, | 260 | .xmit_complete = rds_tcp_xmit_complete, |
| 261 | .xmit_cong_map = rds_tcp_xmit_cong_map, | ||
| 262 | .xmit = rds_tcp_xmit, | 261 | .xmit = rds_tcp_xmit, |
| 263 | .recv = rds_tcp_recv, | 262 | .recv = rds_tcp_recv, |
| 264 | .conn_alloc = rds_tcp_conn_alloc, | 263 | .conn_alloc = rds_tcp_conn_alloc, |
| @@ -266,7 +265,6 @@ struct rds_transport rds_tcp_transport = { | |||
| 266 | .conn_connect = rds_tcp_conn_connect, | 265 | .conn_connect = rds_tcp_conn_connect, |
| 267 | .conn_shutdown = rds_tcp_conn_shutdown, | 266 | .conn_shutdown = rds_tcp_conn_shutdown, |
| 268 | .inc_copy_to_user = rds_tcp_inc_copy_to_user, | 267 | .inc_copy_to_user = rds_tcp_inc_copy_to_user, |
| 269 | .inc_purge = rds_tcp_inc_purge, | ||
| 270 | .inc_free = rds_tcp_inc_free, | 268 | .inc_free = rds_tcp_inc_free, |
| 271 | .stats_info_copy = rds_tcp_stats_info_copy, | 269 | .stats_info_copy = rds_tcp_stats_info_copy, |
| 272 | .exit = rds_tcp_exit, | 270 | .exit = rds_tcp_exit, |
| @@ -276,14 +274,14 @@ struct rds_transport rds_tcp_transport = { | |||
| 276 | .t_prefer_loopback = 1, | 274 | .t_prefer_loopback = 1, |
| 277 | }; | 275 | }; |
| 278 | 276 | ||
| 279 | int __init rds_tcp_init(void) | 277 | int rds_tcp_init(void) |
| 280 | { | 278 | { |
| 281 | int ret; | 279 | int ret; |
| 282 | 280 | ||
| 283 | rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", | 281 | rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", |
| 284 | sizeof(struct rds_tcp_connection), | 282 | sizeof(struct rds_tcp_connection), |
| 285 | 0, 0, NULL); | 283 | 0, 0, NULL); |
| 286 | if (rds_tcp_conn_slab == NULL) { | 284 | if (!rds_tcp_conn_slab) { |
| 287 | ret = -ENOMEM; | 285 | ret = -ENOMEM; |
| 288 | goto out; | 286 | goto out; |
| 289 | } | 287 | } |
diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 844fa6b9cf5a..f5e6f7bebb50 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h | |||
| @@ -43,7 +43,7 @@ struct rds_tcp_statistics { | |||
| 43 | }; | 43 | }; |
| 44 | 44 | ||
| 45 | /* tcp.c */ | 45 | /* tcp.c */ |
| 46 | int __init rds_tcp_init(void); | 46 | int rds_tcp_init(void); |
| 47 | void rds_tcp_exit(void); | 47 | void rds_tcp_exit(void); |
| 48 | void rds_tcp_tune(struct socket *sock); | 48 | void rds_tcp_tune(struct socket *sock); |
| 49 | void rds_tcp_nonagle(struct socket *sock); | 49 | void rds_tcp_nonagle(struct socket *sock); |
| @@ -61,16 +61,15 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn); | |||
| 61 | void rds_tcp_state_change(struct sock *sk); | 61 | void rds_tcp_state_change(struct sock *sk); |
| 62 | 62 | ||
| 63 | /* tcp_listen.c */ | 63 | /* tcp_listen.c */ |
| 64 | int __init rds_tcp_listen_init(void); | 64 | int rds_tcp_listen_init(void); |
| 65 | void rds_tcp_listen_stop(void); | 65 | void rds_tcp_listen_stop(void); |
| 66 | void rds_tcp_listen_data_ready(struct sock *sk, int bytes); | 66 | void rds_tcp_listen_data_ready(struct sock *sk, int bytes); |
| 67 | 67 | ||
| 68 | /* tcp_recv.c */ | 68 | /* tcp_recv.c */ |
| 69 | int __init rds_tcp_recv_init(void); | 69 | int rds_tcp_recv_init(void); |
| 70 | void rds_tcp_recv_exit(void); | 70 | void rds_tcp_recv_exit(void); |
| 71 | void rds_tcp_data_ready(struct sock *sk, int bytes); | 71 | void rds_tcp_data_ready(struct sock *sk, int bytes); |
| 72 | int rds_tcp_recv(struct rds_connection *conn); | 72 | int rds_tcp_recv(struct rds_connection *conn); |
| 73 | void rds_tcp_inc_purge(struct rds_incoming *inc); | ||
| 74 | void rds_tcp_inc_free(struct rds_incoming *inc); | 73 | void rds_tcp_inc_free(struct rds_incoming *inc); |
| 75 | int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, | 74 | int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, |
| 76 | size_t size); | 75 | size_t size); |
| @@ -81,8 +80,6 @@ void rds_tcp_xmit_complete(struct rds_connection *conn); | |||
| 81 | int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, | 80 | int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, |
| 82 | unsigned int hdr_off, unsigned int sg, unsigned int off); | 81 | unsigned int hdr_off, unsigned int sg, unsigned int off); |
| 83 | void rds_tcp_write_space(struct sock *sk); | 82 | void rds_tcp_write_space(struct sock *sk); |
| 84 | int rds_tcp_xmit_cong_map(struct rds_connection *conn, | ||
| 85 | struct rds_cong_map *map, unsigned long offset); | ||
| 86 | 83 | ||
| 87 | /* tcp_stats.c */ | 84 | /* tcp_stats.c */ |
| 88 | DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); | 85 | DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); |
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index c397524c039c..a65ee78db0c5 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c | |||
| @@ -45,7 +45,7 @@ void rds_tcp_state_change(struct sock *sk) | |||
| 45 | 45 | ||
| 46 | read_lock(&sk->sk_callback_lock); | 46 | read_lock(&sk->sk_callback_lock); |
| 47 | conn = sk->sk_user_data; | 47 | conn = sk->sk_user_data; |
| 48 | if (conn == NULL) { | 48 | if (!conn) { |
| 49 | state_change = sk->sk_state_change; | 49 | state_change = sk->sk_state_change; |
| 50 | goto out; | 50 | goto out; |
| 51 | } | 51 | } |
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 975183fe6950..ae27869dfc21 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c | |||
| @@ -116,7 +116,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes) | |||
| 116 | 116 | ||
| 117 | read_lock(&sk->sk_callback_lock); | 117 | read_lock(&sk->sk_callback_lock); |
| 118 | ready = sk->sk_user_data; | 118 | ready = sk->sk_user_data; |
| 119 | if (ready == NULL) { /* check for teardown race */ | 119 | if (!ready) { /* check for teardown race */ |
| 120 | ready = sk->sk_data_ready; | 120 | ready = sk->sk_data_ready; |
| 121 | goto out; | 121 | goto out; |
| 122 | } | 122 | } |
| @@ -135,7 +135,7 @@ out: | |||
| 135 | ready(sk, bytes); | 135 | ready(sk, bytes); |
| 136 | } | 136 | } |
| 137 | 137 | ||
| 138 | int __init rds_tcp_listen_init(void) | 138 | int rds_tcp_listen_init(void) |
| 139 | { | 139 | { |
| 140 | struct sockaddr_in sin; | 140 | struct sockaddr_in sin; |
| 141 | struct socket *sock = NULL; | 141 | struct socket *sock = NULL; |
| @@ -178,7 +178,7 @@ void rds_tcp_listen_stop(void) | |||
| 178 | struct socket *sock = rds_tcp_listen_sock; | 178 | struct socket *sock = rds_tcp_listen_sock; |
| 179 | struct sock *sk; | 179 | struct sock *sk; |
| 180 | 180 | ||
| 181 | if (sock == NULL) | 181 | if (!sock) |
| 182 | return; | 182 | return; |
| 183 | 183 | ||
| 184 | sk = sock->sk; | 184 | sk = sock->sk; |
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 1aba6878fa5d..7017f3af80b6 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c | |||
| @@ -39,7 +39,7 @@ | |||
| 39 | 39 | ||
| 40 | static struct kmem_cache *rds_tcp_incoming_slab; | 40 | static struct kmem_cache *rds_tcp_incoming_slab; |
| 41 | 41 | ||
| 42 | void rds_tcp_inc_purge(struct rds_incoming *inc) | 42 | static void rds_tcp_inc_purge(struct rds_incoming *inc) |
| 43 | { | 43 | { |
| 44 | struct rds_tcp_incoming *tinc; | 44 | struct rds_tcp_incoming *tinc; |
| 45 | tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); | 45 | tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); |
| @@ -190,10 +190,10 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, | |||
| 190 | * processing. | 190 | * processing. |
| 191 | */ | 191 | */ |
| 192 | while (left) { | 192 | while (left) { |
| 193 | if (tinc == NULL) { | 193 | if (!tinc) { |
| 194 | tinc = kmem_cache_alloc(rds_tcp_incoming_slab, | 194 | tinc = kmem_cache_alloc(rds_tcp_incoming_slab, |
| 195 | arg->gfp); | 195 | arg->gfp); |
| 196 | if (tinc == NULL) { | 196 | if (!tinc) { |
| 197 | desc->error = -ENOMEM; | 197 | desc->error = -ENOMEM; |
| 198 | goto out; | 198 | goto out; |
| 199 | } | 199 | } |
| @@ -229,7 +229,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, | |||
| 229 | 229 | ||
| 230 | if (left && tc->t_tinc_data_rem) { | 230 | if (left && tc->t_tinc_data_rem) { |
| 231 | clone = skb_clone(skb, arg->gfp); | 231 | clone = skb_clone(skb, arg->gfp); |
| 232 | if (clone == NULL) { | 232 | if (!clone) { |
| 233 | desc->error = -ENOMEM; | 233 | desc->error = -ENOMEM; |
| 234 | goto out; | 234 | goto out; |
| 235 | } | 235 | } |
| @@ -326,7 +326,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes) | |||
| 326 | 326 | ||
| 327 | read_lock(&sk->sk_callback_lock); | 327 | read_lock(&sk->sk_callback_lock); |
| 328 | conn = sk->sk_user_data; | 328 | conn = sk->sk_user_data; |
| 329 | if (conn == NULL) { /* check for teardown race */ | 329 | if (!conn) { /* check for teardown race */ |
| 330 | ready = sk->sk_data_ready; | 330 | ready = sk->sk_data_ready; |
| 331 | goto out; | 331 | goto out; |
| 332 | } | 332 | } |
| @@ -342,12 +342,12 @@ out: | |||
| 342 | ready(sk, bytes); | 342 | ready(sk, bytes); |
| 343 | } | 343 | } |
| 344 | 344 | ||
| 345 | int __init rds_tcp_recv_init(void) | 345 | int rds_tcp_recv_init(void) |
| 346 | { | 346 | { |
| 347 | rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", | 347 | rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", |
| 348 | sizeof(struct rds_tcp_incoming), | 348 | sizeof(struct rds_tcp_incoming), |
| 349 | 0, 0, NULL); | 349 | 0, 0, NULL); |
| 350 | if (rds_tcp_incoming_slab == NULL) | 350 | if (!rds_tcp_incoming_slab) |
| 351 | return -ENOMEM; | 351 | return -ENOMEM; |
| 352 | return 0; | 352 | return 0; |
| 353 | } | 353 | } |
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index a28b895ff0d1..2979fb4a4b9a 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c | |||
| @@ -77,56 +77,6 @@ int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) | |||
| 77 | } | 77 | } |
| 78 | 78 | ||
| 79 | /* the core send_sem serializes this with other xmit and shutdown */ | 79 | /* the core send_sem serializes this with other xmit and shutdown */ |
| 80 | int rds_tcp_xmit_cong_map(struct rds_connection *conn, | ||
| 81 | struct rds_cong_map *map, unsigned long offset) | ||
| 82 | { | ||
| 83 | static struct rds_header rds_tcp_map_header = { | ||
| 84 | .h_flags = RDS_FLAG_CONG_BITMAP, | ||
| 85 | }; | ||
| 86 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
| 87 | unsigned long i; | ||
| 88 | int ret; | ||
| 89 | int copied = 0; | ||
| 90 | |||
| 91 | /* Some problem claims cpu_to_be32(constant) isn't a constant. */ | ||
| 92 | rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES); | ||
| 93 | |||
| 94 | if (offset < sizeof(struct rds_header)) { | ||
| 95 | ret = rds_tcp_sendmsg(tc->t_sock, | ||
| 96 | (void *)&rds_tcp_map_header + offset, | ||
| 97 | sizeof(struct rds_header) - offset); | ||
| 98 | if (ret <= 0) | ||
| 99 | return ret; | ||
| 100 | offset += ret; | ||
| 101 | copied = ret; | ||
| 102 | if (offset < sizeof(struct rds_header)) | ||
| 103 | return ret; | ||
| 104 | } | ||
| 105 | |||
| 106 | offset -= sizeof(struct rds_header); | ||
| 107 | i = offset / PAGE_SIZE; | ||
| 108 | offset = offset % PAGE_SIZE; | ||
| 109 | BUG_ON(i >= RDS_CONG_MAP_PAGES); | ||
| 110 | |||
| 111 | do { | ||
| 112 | ret = tc->t_sock->ops->sendpage(tc->t_sock, | ||
| 113 | virt_to_page(map->m_page_addrs[i]), | ||
| 114 | offset, PAGE_SIZE - offset, | ||
| 115 | MSG_DONTWAIT); | ||
| 116 | if (ret <= 0) | ||
| 117 | break; | ||
| 118 | copied += ret; | ||
| 119 | offset += ret; | ||
| 120 | if (offset == PAGE_SIZE) { | ||
| 121 | offset = 0; | ||
| 122 | i++; | ||
| 123 | } | ||
| 124 | } while (i < RDS_CONG_MAP_PAGES); | ||
| 125 | |||
| 126 | return copied ? copied : ret; | ||
| 127 | } | ||
| 128 | |||
| 129 | /* the core send_sem serializes this with other xmit and shutdown */ | ||
| 130 | int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, | 80 | int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, |
| 131 | unsigned int hdr_off, unsigned int sg, unsigned int off) | 81 | unsigned int hdr_off, unsigned int sg, unsigned int off) |
| 132 | { | 82 | { |
| @@ -166,21 +116,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
| 166 | goto out; | 116 | goto out; |
| 167 | } | 117 | } |
| 168 | 118 | ||
| 169 | while (sg < rm->m_nents) { | 119 | while (sg < rm->data.op_nents) { |
| 170 | ret = tc->t_sock->ops->sendpage(tc->t_sock, | 120 | ret = tc->t_sock->ops->sendpage(tc->t_sock, |
| 171 | sg_page(&rm->m_sg[sg]), | 121 | sg_page(&rm->data.op_sg[sg]), |
| 172 | rm->m_sg[sg].offset + off, | 122 | rm->data.op_sg[sg].offset + off, |
| 173 | rm->m_sg[sg].length - off, | 123 | rm->data.op_sg[sg].length - off, |
| 174 | MSG_DONTWAIT|MSG_NOSIGNAL); | 124 | MSG_DONTWAIT|MSG_NOSIGNAL); |
| 175 | rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]), | 125 | rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]), |
| 176 | rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off, | 126 | rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, |
| 177 | ret); | 127 | ret); |
| 178 | if (ret <= 0) | 128 | if (ret <= 0) |
| 179 | break; | 129 | break; |
| 180 | 130 | ||
| 181 | off += ret; | 131 | off += ret; |
| 182 | done += ret; | 132 | done += ret; |
| 183 | if (off == rm->m_sg[sg].length) { | 133 | if (off == rm->data.op_sg[sg].length) { |
| 184 | off = 0; | 134 | off = 0; |
| 185 | sg++; | 135 | sg++; |
| 186 | } | 136 | } |
| @@ -226,7 +176,7 @@ void rds_tcp_write_space(struct sock *sk) | |||
| 226 | 176 | ||
| 227 | read_lock(&sk->sk_callback_lock); | 177 | read_lock(&sk->sk_callback_lock); |
| 228 | conn = sk->sk_user_data; | 178 | conn = sk->sk_user_data; |
| 229 | if (conn == NULL) { | 179 | if (!conn) { |
| 230 | write_space = sk->sk_write_space; | 180 | write_space = sk->sk_write_space; |
| 231 | goto out; | 181 | goto out; |
| 232 | } | 182 | } |
diff --git a/net/rds/threads.c b/net/rds/threads.c index 786c20eaaf5e..0fd90f8c5f59 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c | |||
| @@ -61,7 +61,7 @@ | |||
| 61 | * | 61 | * |
| 62 | * Transition to state DISCONNECTING/DOWN: | 62 | * Transition to state DISCONNECTING/DOWN: |
| 63 | * - Inside the shutdown worker; synchronizes with xmit path | 63 | * - Inside the shutdown worker; synchronizes with xmit path |
| 64 | * through c_send_lock, and with connection management callbacks | 64 | * through RDS_IN_XMIT, and with connection management callbacks |
| 65 | * via c_cm_lock. | 65 | * via c_cm_lock. |
| 66 | * | 66 | * |
| 67 | * For receive callbacks, we rely on the underlying transport | 67 | * For receive callbacks, we rely on the underlying transport |
| @@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete); | |||
| 110 | * We should *always* start with a random backoff; otherwise a broken connection | 110 | * We should *always* start with a random backoff; otherwise a broken connection |
| 111 | * will always take several iterations to be re-established. | 111 | * will always take several iterations to be re-established. |
| 112 | */ | 112 | */ |
| 113 | static void rds_queue_reconnect(struct rds_connection *conn) | 113 | void rds_queue_reconnect(struct rds_connection *conn) |
| 114 | { | 114 | { |
| 115 | unsigned long rand; | 115 | unsigned long rand; |
| 116 | 116 | ||
| @@ -156,58 +156,6 @@ void rds_connect_worker(struct work_struct *work) | |||
| 156 | } | 156 | } |
| 157 | } | 157 | } |
| 158 | 158 | ||
| 159 | void rds_shutdown_worker(struct work_struct *work) | ||
| 160 | { | ||
| 161 | struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); | ||
| 162 | |||
| 163 | /* shut it down unless it's down already */ | ||
| 164 | if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { | ||
| 165 | /* | ||
| 166 | * Quiesce the connection mgmt handlers before we start tearing | ||
| 167 | * things down. We don't hold the mutex for the entire | ||
| 168 | * duration of the shutdown operation, else we may be | ||
| 169 | * deadlocking with the CM handler. Instead, the CM event | ||
| 170 | * handler is supposed to check for state DISCONNECTING | ||
| 171 | */ | ||
| 172 | mutex_lock(&conn->c_cm_lock); | ||
| 173 | if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) && | ||
| 174 | !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { | ||
| 175 | rds_conn_error(conn, "shutdown called in state %d\n", | ||
| 176 | atomic_read(&conn->c_state)); | ||
| 177 | mutex_unlock(&conn->c_cm_lock); | ||
| 178 | return; | ||
| 179 | } | ||
| 180 | mutex_unlock(&conn->c_cm_lock); | ||
| 181 | |||
| 182 | mutex_lock(&conn->c_send_lock); | ||
| 183 | conn->c_trans->conn_shutdown(conn); | ||
| 184 | rds_conn_reset(conn); | ||
| 185 | mutex_unlock(&conn->c_send_lock); | ||
| 186 | |||
| 187 | if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { | ||
| 188 | /* This can happen - eg when we're in the middle of tearing | ||
| 189 | * down the connection, and someone unloads the rds module. | ||
| 190 | * Quite reproduceable with loopback connections. | ||
| 191 | * Mostly harmless. | ||
| 192 | */ | ||
| 193 | rds_conn_error(conn, | ||
| 194 | "%s: failed to transition to state DOWN, " | ||
| 195 | "current state is %d\n", | ||
| 196 | __func__, | ||
| 197 | atomic_read(&conn->c_state)); | ||
| 198 | return; | ||
| 199 | } | ||
| 200 | } | ||
| 201 | |||
| 202 | /* Then reconnect if it's still live. | ||
| 203 | * The passive side of an IB loopback connection is never added | ||
| 204 | * to the conn hash, so we never trigger a reconnect on this | ||
| 205 | * conn - the reconnect is always triggered by the active peer. */ | ||
| 206 | cancel_delayed_work(&conn->c_conn_w); | ||
| 207 | if (!hlist_unhashed(&conn->c_hash_node)) | ||
| 208 | rds_queue_reconnect(conn); | ||
| 209 | } | ||
| 210 | |||
| 211 | void rds_send_worker(struct work_struct *work) | 159 | void rds_send_worker(struct work_struct *work) |
| 212 | { | 160 | { |
| 213 | struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); | 161 | struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); |
| @@ -252,15 +200,22 @@ void rds_recv_worker(struct work_struct *work) | |||
| 252 | } | 200 | } |
| 253 | } | 201 | } |
| 254 | 202 | ||
| 203 | void rds_shutdown_worker(struct work_struct *work) | ||
| 204 | { | ||
| 205 | struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); | ||
| 206 | |||
| 207 | rds_conn_shutdown(conn); | ||
| 208 | } | ||
| 209 | |||
| 255 | void rds_threads_exit(void) | 210 | void rds_threads_exit(void) |
| 256 | { | 211 | { |
| 257 | destroy_workqueue(rds_wq); | 212 | destroy_workqueue(rds_wq); |
| 258 | } | 213 | } |
| 259 | 214 | ||
| 260 | int __init rds_threads_init(void) | 215 | int rds_threads_init(void) |
| 261 | { | 216 | { |
| 262 | rds_wq = create_workqueue("krdsd"); | 217 | rds_wq = create_singlethread_workqueue("krdsd"); |
| 263 | if (rds_wq == NULL) | 218 | if (!rds_wq) |
| 264 | return -ENOMEM; | 219 | return -ENOMEM; |
| 265 | 220 | ||
| 266 | return 0; | 221 | return 0; |
diff --git a/net/rds/transport.c b/net/rds/transport.c index 7e1067901353..7f2ac4fec367 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c | |||
| @@ -71,19 +71,28 @@ void rds_trans_unregister(struct rds_transport *trans) | |||
| 71 | } | 71 | } |
| 72 | EXPORT_SYMBOL_GPL(rds_trans_unregister); | 72 | EXPORT_SYMBOL_GPL(rds_trans_unregister); |
| 73 | 73 | ||
| 74 | void rds_trans_put(struct rds_transport *trans) | ||
| 75 | { | ||
| 76 | if (trans && trans->t_owner) | ||
| 77 | module_put(trans->t_owner); | ||
| 78 | } | ||
| 79 | |||
| 74 | struct rds_transport *rds_trans_get_preferred(__be32 addr) | 80 | struct rds_transport *rds_trans_get_preferred(__be32 addr) |
| 75 | { | 81 | { |
| 76 | struct rds_transport *ret = NULL; | 82 | struct rds_transport *ret = NULL; |
| 77 | int i; | 83 | struct rds_transport *trans; |
| 84 | unsigned int i; | ||
| 78 | 85 | ||
| 79 | if (IN_LOOPBACK(ntohl(addr))) | 86 | if (IN_LOOPBACK(ntohl(addr))) |
| 80 | return &rds_loop_transport; | 87 | return &rds_loop_transport; |
| 81 | 88 | ||
| 82 | down_read(&rds_trans_sem); | 89 | down_read(&rds_trans_sem); |
| 83 | for (i = 0; i < RDS_TRANS_COUNT; i++) | 90 | for (i = 0; i < RDS_TRANS_COUNT; i++) { |
| 84 | { | 91 | trans = transports[i]; |
| 85 | if (transports[i] && (transports[i]->laddr_check(addr) == 0)) { | 92 | |
| 86 | ret = transports[i]; | 93 | if (trans && (trans->laddr_check(addr) == 0) && |
| 94 | (!trans->t_owner || try_module_get(trans->t_owner))) { | ||
| 95 | ret = trans; | ||
| 87 | break; | 96 | break; |
| 88 | } | 97 | } |
| 89 | } | 98 | } |
diff --git a/net/rds/xlist.h b/net/rds/xlist.h new file mode 100644 index 000000000000..e6b5190daddd --- /dev/null +++ b/net/rds/xlist.h | |||
| @@ -0,0 +1,80 @@ | |||
| 1 | #ifndef _LINUX_XLIST_H | ||
| 2 | #define _LINUX_XLIST_H | ||
| 3 | |||
| 4 | #include <linux/stddef.h> | ||
| 5 | #include <linux/poison.h> | ||
| 6 | #include <linux/prefetch.h> | ||
| 7 | #include <asm/system.h> | ||
| 8 | |||
| 9 | struct xlist_head { | ||
| 10 | struct xlist_head *next; | ||
| 11 | }; | ||
| 12 | |||
| 13 | static inline void INIT_XLIST_HEAD(struct xlist_head *list) | ||
| 14 | { | ||
| 15 | list->next = NULL; | ||
| 16 | } | ||
| 17 | |||
| 18 | static inline int xlist_empty(struct xlist_head *head) | ||
| 19 | { | ||
| 20 | return head->next == NULL; | ||
| 21 | } | ||
| 22 | |||
| 23 | static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail, | ||
| 24 | struct xlist_head *head) | ||
| 25 | { | ||
| 26 | struct xlist_head *cur; | ||
| 27 | struct xlist_head *check; | ||
| 28 | |||
| 29 | while (1) { | ||
| 30 | cur = head->next; | ||
| 31 | tail->next = cur; | ||
| 32 | check = cmpxchg(&head->next, cur, new); | ||
| 33 | if (check == cur) | ||
| 34 | break; | ||
| 35 | } | ||
| 36 | } | ||
| 37 | |||
| 38 | static inline struct xlist_head *xlist_del_head(struct xlist_head *head) | ||
| 39 | { | ||
| 40 | struct xlist_head *cur; | ||
| 41 | struct xlist_head *check; | ||
| 42 | struct xlist_head *next; | ||
| 43 | |||
| 44 | while (1) { | ||
| 45 | cur = head->next; | ||
| 46 | if (!cur) | ||
| 47 | goto out; | ||
| 48 | |||
| 49 | next = cur->next; | ||
| 50 | check = cmpxchg(&head->next, cur, next); | ||
| 51 | if (check == cur) | ||
| 52 | goto out; | ||
| 53 | } | ||
| 54 | out: | ||
| 55 | return cur; | ||
| 56 | } | ||
| 57 | |||
| 58 | static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head) | ||
| 59 | { | ||
| 60 | struct xlist_head *cur; | ||
| 61 | |||
| 62 | cur = head->next; | ||
| 63 | if (!cur) | ||
| 64 | return NULL; | ||
| 65 | |||
| 66 | head->next = cur->next; | ||
| 67 | return cur; | ||
| 68 | } | ||
| 69 | |||
| 70 | static inline void xlist_splice(struct xlist_head *list, | ||
| 71 | struct xlist_head *head) | ||
| 72 | { | ||
| 73 | struct xlist_head *cur; | ||
| 74 | |||
| 75 | WARN_ON(head->next); | ||
| 76 | cur = xchg(&list->next, NULL); | ||
| 77 | head->next = cur; | ||
| 78 | } | ||
| 79 | |||
| 80 | #endif | ||
