diff options
author | Andy Grover <andy.grover@oracle.com> | 2009-02-24 10:30:30 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2009-02-27 02:39:30 -0500 |
commit | ec16227e14141e4fd7ae76354c09dadfe2449d9e (patch) | |
tree | 9a20de7d9a0688d721517c2b38d6be83956c7ca4 /net/rds/ib.h | |
parent | eff5f53bef75c0864a5da06bb688939092b848dc (diff) |
RDS/IB: Infiniband transport
Registers as an RDS transport and an IB client, and uses IB CM
API to allocate ids, queue pairs, and the rest of that fun stuff.
Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/rds/ib.h')
-rw-r--r-- | net/rds/ib.h | 367 |
1 files changed, 367 insertions, 0 deletions
diff --git a/net/rds/ib.h b/net/rds/ib.h new file mode 100644 index 000000000000..8be563a1363a --- /dev/null +++ b/net/rds/ib.h | |||
@@ -0,0 +1,367 @@ | |||
1 | #ifndef _RDS_IB_H | ||
2 | #define _RDS_IB_H | ||
3 | |||
4 | #include <rdma/ib_verbs.h> | ||
5 | #include <rdma/rdma_cm.h> | ||
6 | #include "rds.h" | ||
7 | #include "rdma_transport.h" | ||
8 | |||
9 | #define RDS_FMR_SIZE 256 | ||
10 | #define RDS_FMR_POOL_SIZE 4096 | ||
11 | |||
12 | #define RDS_IB_MAX_SGE 8 | ||
13 | #define RDS_IB_RECV_SGE 2 | ||
14 | |||
15 | #define RDS_IB_DEFAULT_RECV_WR 1024 | ||
16 | #define RDS_IB_DEFAULT_SEND_WR 256 | ||
17 | |||
18 | #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ | ||
19 | |||
20 | extern struct list_head rds_ib_devices; | ||
21 | |||
22 | /* | ||
23 | * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to | ||
24 | * try and minimize the amount of memory tied up in both the device and | ||
25 | * socket receive queues. | ||
26 | */ | ||
27 | /* page offset of the final full frag that fits in the page */ | ||
28 | #define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) | ||
29 | struct rds_page_frag { /* one receive fragment carved out of a page */ | ||
30 | struct list_head f_item; /* list linkage; presumably chains onto rds_ib_incoming.ii_frags -- confirm */ | ||
31 | struct page *f_page; /* page backing this fragment */ | ||
32 | unsigned long f_offset; /* NOTE(review): looks like the byte offset within f_page -- confirm */ | ||
33 | dma_addr_t f_mapped; /* NOTE(review): presumably the DMA address of the mapped fragment -- confirm */ | ||
34 | }; | ||
35 | |||
36 | struct rds_ib_incoming { /* IB-side wrapper around a generic struct rds_incoming */ | ||
37 | struct list_head ii_frags; /* list head; presumably of struct rds_page_frag via f_item -- confirm */ | ||
38 | struct rds_incoming ii_inc; /* embedded generic incoming state (type not defined in this header) */ | ||
39 | }; | ||
40 | |||
41 | struct rds_ib_connect_private { /* NOTE(review): fixed-endian fields suggest a wire format exchanged at connect time -- confirm */ | ||
42 | /* Add new fields at the end, and don't permute existing fields. */ | ||
43 | __be32 dp_saddr; /* big-endian; presumably the source address -- confirm */ | ||
44 | __be32 dp_daddr; /* big-endian; presumably the destination address -- confirm */ | ||
45 | u8 dp_protocol_major; | ||
46 | u8 dp_protocol_minor; | ||
47 | __be16 dp_protocol_minor_mask; /* bitmask */ | ||
48 | __be32 dp_reserved1; | ||
49 | __be64 dp_ack_seq; | ||
50 | __be32 dp_credit; /* non-zero enables flow ctl */ | ||
51 | }; | ||
52 | |||
53 | struct rds_ib_send_work { /* state kept alongside one send work request */ | ||
54 | struct rds_message *s_rm; | ||
55 | struct rds_rdma_op *s_op; | ||
56 | struct ib_send_wr s_wr; /* the send work request itself */ | ||
57 | struct ib_sge s_sge[RDS_IB_MAX_SGE]; /* room for RDS_IB_MAX_SGE (8) scatter/gather entries */ | ||
58 | unsigned long s_queued; /* NOTE(review): looks like a queue-time stamp (jiffies?) -- confirm */ | ||
59 | }; | ||
60 | |||
61 | struct rds_ib_recv_work { /* state kept alongside one receive work request */ | ||
62 | struct rds_ib_incoming *r_ibinc; | ||
63 | struct rds_page_frag *r_frag; | ||
64 | struct ib_recv_wr r_wr; /* the receive work request itself */ | ||
65 | struct ib_sge r_sge[RDS_IB_RECV_SGE]; /* sized by the named constant (2) rather than a bare literal */ | ||
66 | }; | ||
67 | |||
68 | struct rds_ib_work_ring { /* ring bookkeeping operated on by the rds_ib_ring_*() helpers */ | ||
69 | u32 w_nr; /* presumably the number of entries in the ring -- confirm in ib_ring.c */ | ||
70 | u32 w_alloc_ptr; | ||
71 | u32 w_alloc_ctr; | ||
72 | u32 w_free_ptr; | ||
73 | atomic_t w_free_ctr; /* the only atomic member; the others are plain u32 */ | ||
74 | }; | ||
75 | |||
76 | struct rds_ib_device; | ||
77 | |||
78 | struct rds_ib_connection { /* IB transport state tied to one struct rds_connection (see conn) */ | ||
79 | |||
80 | struct list_head ib_node; /* list linkage; presumably on rds_ib_device.conn_list or ib_nodev_conns -- confirm */ | ||
81 | struct rds_ib_device *rds_ibdev; | ||
82 | struct rds_connection *conn; | ||
83 | |||
84 | /* alphabet soup, IBTA style */ | ||
85 | struct rdma_cm_id *i_cm_id; | ||
86 | struct ib_pd *i_pd; | ||
87 | struct ib_mr *i_mr; | ||
88 | struct ib_cq *i_send_cq; | ||
89 | struct ib_cq *i_recv_cq; | ||
90 | |||
91 | /* tx */ | ||
92 | struct rds_ib_work_ring i_send_ring; | ||
93 | struct rds_message *i_rm; | ||
94 | struct rds_header *i_send_hdrs; | ||
95 | u64 i_send_hdrs_dma; /* NOTE(review): DMA handle held as u64 rather than dma_addr_t (cf. rds_page_frag.f_mapped) */ | ||
96 | struct rds_ib_send_work *i_sends; | ||
97 | |||
98 | /* rx */ | ||
99 | struct mutex i_recv_mutex; | ||
100 | struct rds_ib_work_ring i_recv_ring; | ||
101 | struct rds_ib_incoming *i_ibinc; | ||
102 | u32 i_recv_data_rem; | ||
103 | struct rds_header *i_recv_hdrs; | ||
104 | u64 i_recv_hdrs_dma; | ||
105 | struct rds_ib_recv_work *i_recvs; | ||
106 | struct rds_page_frag i_frag; | ||
107 | u64 i_ack_recv; /* last ACK received */ | ||
108 | |||
109 | /* sending acks */ | ||
110 | unsigned long i_ack_flags; /* bit numbers: IB_ACK_IN_FLIGHT, IB_ACK_REQUESTED */ | ||
111 | u64 i_ack_next; /* next ACK to send */ | ||
112 | struct rds_header *i_ack; | ||
113 | struct ib_send_wr i_ack_wr; | ||
114 | struct ib_sge i_ack_sge; | ||
115 | u64 i_ack_dma; | ||
116 | unsigned long i_ack_queued; | ||
117 | |||
118 | /* Flow control related information | ||
119 | * | ||
120 | * Our algorithm uses a pair of variables that we need to access | ||
121 | * atomically - one for the send credits, and one for the posted | ||
122 | * recv credits we need to transfer to remote. | ||
123 | * Rather than protect them using a slow spinlock, we put both into | ||
124 | * a single atomic_t and update it using cmpxchg. | ||
125 | */ | ||
126 | atomic_t i_credits; /* packed value; see the IB_{GET,SET}_{SEND,POST}_CREDITS macros */ | ||
127 | |||
128 | /* Protocol version specific information */ | ||
129 | unsigned int i_flowctl:1; /* enable/disable flow ctl */ | ||
130 | |||
131 | /* Batched completions */ | ||
132 | unsigned int i_unsignaled_wrs; | ||
133 | long i_unsignaled_bytes; | ||
134 | }; | ||
135 | |||
136 | /* Credit packing: send credits in bits 0-15, posted credits from bit 16 up. This assumes that atomic_t is at least 32 bits */ | ||
137 | #define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) /* extract the low 16 bits */ | ||
138 | #define IB_GET_POST_CREDITS(v) ((v) >> 16) /* extract everything above bit 15 */ | ||
139 | #define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) /* mask a count into the low 16 bits */ | ||
140 | #define IB_SET_POST_CREDITS(v) ((v) << 16) /* shift a count up past the low 16 bits */ | ||
141 | |||
142 | struct rds_ib_ipaddr { /* one big-endian 32-bit IP address kept on a list */ | ||
143 | struct list_head list; /* presumably linked on rds_ib_device.ipaddr_list -- confirm */ | ||
144 | __be32 ipaddr; | ||
145 | }; | ||
146 | |||
147 | struct rds_ib_device { /* RDS state kept for one struct ib_device (see dev) */ | ||
148 | struct list_head list; /* presumably linkage on rds_ib_devices -- confirm */ | ||
149 | struct list_head ipaddr_list; /* presumably of struct rds_ib_ipaddr -- confirm */ | ||
150 | struct list_head conn_list; /* presumably of struct rds_ib_connection via ib_node -- confirm */ | ||
151 | struct ib_device *dev; | ||
152 | struct ib_pd *pd; | ||
153 | struct ib_mr *mr; | ||
154 | struct rds_ib_mr_pool *mr_pool; /* opaque here; created/destroyed by rds_ib_create_mr_pool()/rds_ib_destroy_mr_pool() */ | ||
155 | int fmr_page_shift; | ||
156 | int fmr_page_size; | ||
157 | u64 fmr_page_mask; | ||
158 | unsigned int fmr_max_remaps; | ||
159 | unsigned int max_fmrs; | ||
160 | int max_sge; | ||
161 | unsigned int max_wrs; | ||
162 | spinlock_t spinlock; /* protect the above */ | ||
163 | }; | ||
164 | |||
165 | /* bits for i_ack_flags */ | ||
166 | #define IB_ACK_IN_FLIGHT 0 | ||
167 | #define IB_ACK_REQUESTED 1 | ||
168 | |||
169 | /* Magic WR_ID for ACKs */ | ||
170 | #define RDS_IB_ACK_WR_ID (~(u64) 0) | ||
171 | |||
172 | struct rds_ib_statistics { /* 64-bit counters; declared per CPU as rds_ib_stats and bumped via rds_ib_stats_inc() */ | ||
173 | uint64_t s_ib_connect_raced; | ||
174 | uint64_t s_ib_listen_closed_stale; | ||
175 | uint64_t s_ib_tx_cq_call; /* s_ib_tx_*: transmit-side counters */ | ||
176 | uint64_t s_ib_tx_cq_event; | ||
177 | uint64_t s_ib_tx_ring_full; | ||
178 | uint64_t s_ib_tx_throttle; | ||
179 | uint64_t s_ib_tx_sg_mapping_failure; | ||
180 | uint64_t s_ib_tx_stalled; | ||
181 | uint64_t s_ib_tx_credit_updates; | ||
182 | uint64_t s_ib_rx_cq_call; /* s_ib_rx_*: receive-side counters */ | ||
183 | uint64_t s_ib_rx_cq_event; | ||
184 | uint64_t s_ib_rx_ring_empty; | ||
185 | uint64_t s_ib_rx_refill_from_cq; | ||
186 | uint64_t s_ib_rx_refill_from_thread; | ||
187 | uint64_t s_ib_rx_alloc_limit; | ||
188 | uint64_t s_ib_rx_credit_updates; | ||
189 | uint64_t s_ib_ack_sent; /* s_ib_ack_*: ACK handling counters */ | ||
190 | uint64_t s_ib_ack_send_failure; | ||
191 | uint64_t s_ib_ack_send_delayed; | ||
192 | uint64_t s_ib_ack_send_piggybacked; | ||
193 | uint64_t s_ib_ack_received; | ||
194 | uint64_t s_ib_rdma_mr_alloc; /* s_ib_rdma_mr_*: memory-region (MR) pool counters */ | ||
195 | uint64_t s_ib_rdma_mr_free; | ||
196 | uint64_t s_ib_rdma_mr_used; | ||
197 | uint64_t s_ib_rdma_mr_pool_flush; | ||
198 | uint64_t s_ib_rdma_mr_pool_wait; | ||
199 | uint64_t s_ib_rdma_mr_pool_depleted; | ||
200 | }; | ||
201 | |||
202 | extern struct workqueue_struct *rds_ib_wq; | ||
203 | |||
204 | /* | ||
205 | * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h | ||
206 | * doesn't define it. | ||
207 | */ | ||
208 | static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev, | ||
209 | struct scatterlist *sg, unsigned int sg_dma_len, int direction) | ||
210 | { /* Sync each of the sg_dma_len mapped entries starting at sg for CPU access. */ | ||
211 | struct scatterlist *cur; /* current entry; advanced by for_each_sg() so chained lists are followed */ | ||
212 | unsigned int i; | ||
213 | |||
214 | for_each_sg(sg, cur, sg_dma_len, i) { | ||
215 | ib_dma_sync_single_for_cpu(dev, | ||
216 | ib_sg_dma_address(dev, cur), | ||
217 | ib_sg_dma_len(dev, cur), direction); | ||
218 | } | ||
219 | } | ||
220 | #define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu | ||
221 | |||
222 | static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, | ||
223 | struct scatterlist *sg, unsigned int sg_dma_len, int direction) | ||
224 | { /* Sync each of the sg_dma_len mapped entries starting at sg for device access. */ | ||
225 | struct scatterlist *cur; /* current entry; advanced by for_each_sg() so chained lists are followed */ | ||
226 | unsigned int i; | ||
227 | |||
228 | for_each_sg(sg, cur, sg_dma_len, i) { | ||
229 | ib_dma_sync_single_for_device(dev, | ||
230 | ib_sg_dma_address(dev, cur), | ||
231 | ib_sg_dma_len(dev, cur), direction); | ||
232 | } | ||
233 | } | ||
234 | #define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device | ||
235 | |||
236 | |||
237 | /* ib.c */ | ||
238 | extern struct rds_transport rds_ib_transport; | ||
239 | extern void rds_ib_add_one(struct ib_device *device); | ||
240 | extern void rds_ib_remove_one(struct ib_device *device); | ||
241 | extern struct ib_client rds_ib_client; | ||
242 | |||
243 | extern unsigned int fmr_pool_size; | ||
244 | extern unsigned int fmr_message_size; | ||
245 | |||
246 | extern spinlock_t ib_nodev_conns_lock; | ||
247 | extern struct list_head ib_nodev_conns; | ||
248 | |||
249 | /* ib_cm.c */ | ||
250 | int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp); | ||
251 | void rds_ib_conn_free(void *arg); | ||
252 | int rds_ib_conn_connect(struct rds_connection *conn); | ||
253 | void rds_ib_conn_shutdown(struct rds_connection *conn); | ||
254 | void rds_ib_state_change(struct sock *sk); | ||
255 | int __init rds_ib_listen_init(void); | ||
256 | void rds_ib_listen_stop(void); | ||
257 | void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); | ||
258 | int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, | ||
259 | struct rdma_cm_event *event); | ||
260 | int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); | ||
261 | void rds_ib_cm_connect_complete(struct rds_connection *conn, | ||
262 | struct rdma_cm_event *event); | ||
263 | |||
264 | |||
265 | #define rds_ib_conn_error(conn, fmt...) \ | ||
266 | __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) | ||
267 | |||
268 | /* ib_rdma.c */ | ||
269 | int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); | ||
270 | int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); | ||
271 | void rds_ib_remove_nodev_conns(void); | ||
272 | void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev); | ||
273 | struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); | ||
274 | void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); | ||
275 | void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); | ||
276 | void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | ||
277 | struct rds_sock *rs, u32 *key_ret); | ||
278 | void rds_ib_sync_mr(void *trans_private, int dir); | ||
279 | void rds_ib_free_mr(void *trans_private, int invalidate); | ||
280 | void rds_ib_flush_mrs(void); | ||
281 | |||
282 | /* ib_recv.c */ | ||
283 | int __init rds_ib_recv_init(void); | ||
284 | void rds_ib_recv_exit(void); | ||
285 | int rds_ib_recv(struct rds_connection *conn); | ||
286 | int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | ||
287 | gfp_t page_gfp, int prefill); | ||
288 | void rds_ib_inc_purge(struct rds_incoming *inc); | ||
289 | void rds_ib_inc_free(struct rds_incoming *inc); | ||
290 | int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, | ||
291 | size_t size); | ||
292 | void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); | ||
293 | void rds_ib_recv_init_ring(struct rds_ib_connection *ic); | ||
294 | void rds_ib_recv_clear_ring(struct rds_ib_connection *ic); | ||
295 | void rds_ib_recv_init_ack(struct rds_ib_connection *ic); | ||
296 | void rds_ib_attempt_ack(struct rds_ib_connection *ic); | ||
297 | void rds_ib_ack_send_complete(struct rds_ib_connection *ic); | ||
298 | u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic); | ||
299 | |||
300 | /* ib_ring.c */ | ||
301 | void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr); | ||
302 | void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr); | ||
303 | u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos); | ||
304 | void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val); | ||
305 | void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val); | ||
306 | int rds_ib_ring_empty(struct rds_ib_work_ring *ring); | ||
307 | int rds_ib_ring_low(struct rds_ib_work_ring *ring); | ||
308 | u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring); | ||
309 | u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); | ||
310 | extern wait_queue_head_t rds_ib_ring_empty_wait; | ||
311 | |||
312 | /* ib_send.c */ | ||
313 | void rds_ib_xmit_complete(struct rds_connection *conn); | ||
314 | int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | ||
315 | unsigned int hdr_off, unsigned int sg, unsigned int off); | ||
316 | void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); | ||
317 | void rds_ib_send_init_ring(struct rds_ib_connection *ic); | ||
318 | void rds_ib_send_clear_ring(struct rds_ib_connection *ic); | ||
319 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); | ||
320 | void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); | ||
321 | void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); | ||
322 | int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, | ||
323 | u32 *adv_credits, int need_posted); | ||
324 | |||
325 | /* ib_stats.c */ | ||
326 | DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); | ||
327 | #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) | ||
328 | unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, | ||
329 | unsigned int avail); | ||
330 | |||
331 | /* ib_sysctl.c */ | ||
332 | int __init rds_ib_sysctl_init(void); | ||
333 | void rds_ib_sysctl_exit(void); | ||
334 | extern unsigned long rds_ib_sysctl_max_send_wr; | ||
335 | extern unsigned long rds_ib_sysctl_max_recv_wr; | ||
336 | extern unsigned long rds_ib_sysctl_max_unsig_wrs; | ||
337 | extern unsigned long rds_ib_sysctl_max_unsig_bytes; | ||
338 | extern unsigned long rds_ib_sysctl_max_recv_allocation; | ||
339 | extern unsigned int rds_ib_sysctl_flow_control; | ||
340 | extern ctl_table rds_ib_sysctl_table[]; | ||
341 | |||
342 | /* | ||
343 | * Helper functions for getting/setting the header and data SGEs in | ||
344 | * RDS packets (not RDMA) | ||
345 | */ | ||
346 | static inline struct ib_sge * | ||
347 | rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge) | ||
348 | { /* The header uses the first entry of the SGE array; ic is not consulted. */ | ||
349 | return sge; | ||
350 | } | ||
351 | |||
352 | static inline struct ib_sge * | ||
353 | rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge) | ||
354 | { /* The data uses the second entry of the SGE array; ic is not consulted. */ | ||
355 | return sge + 1; | ||
356 | } | ||
357 | |||
358 | static inline void rds_ib_set_64bit(u64 *ptr, u64 val) | ||
359 | { /* Store val into *ptr, choosing the store primitive by the width of long. */ | ||
360 | #if BITS_PER_LONG != 64 /* long is not 64 bits wide: go through set_64bit() */ | ||
361 | set_64bit(ptr, val); | ||
362 | #else /* 64-bit long: a plain assignment */ | ||
363 | *ptr = val; | ||
364 | #endif | ||
365 | } | ||
366 | |||
367 | #endif | ||