diff options
author | Philipp Reisner <philipp.reisner@linbit.com> | 2009-09-25 19:07:19 -0400 |
---|---|---|
committer | Jens Axboe <jens.axboe@oracle.com> | 2009-10-01 15:17:49 -0400 |
commit | b411b3637fa71fce9cf2acf0639009500f5892fe (patch) | |
tree | 6b88e5202e0f137fef50e95b0441bcafdbf91990 /include | |
parent | 1a35e0f6443f4266dad4c569c55c57a9032596fa (diff) |
The DRBD driver
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/drbd.h | 349 | ||||
-rw-r--r-- | include/linux/drbd_limits.h | 137 | ||||
-rw-r--r-- | include/linux/drbd_nl.h | 137 | ||||
-rw-r--r-- | include/linux/drbd_tag_magic.h | 83 | ||||
-rw-r--r-- | include/linux/lru_cache.h | 294 |
5 files changed, 1000 insertions, 0 deletions
diff --git a/include/linux/drbd.h b/include/linux/drbd.h new file mode 100644 index 000000000000..69dc711f37b3 --- /dev/null +++ b/include/linux/drbd.h | |||
@@ -0,0 +1,349 @@ | |||
1 | /* | ||
2 | drbd.h | ||
3 | Kernel module for 2.6.x Kernels | ||
4 | |||
5 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
6 | |||
7 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
8 | Copyright (C) 2001-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
9 | Copyright (C) 2001-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
10 | |||
11 | drbd is free software; you can redistribute it and/or modify | ||
12 | it under the terms of the GNU General Public License as published by | ||
13 | the Free Software Foundation; either version 2, or (at your option) | ||
14 | any later version. | ||
15 | |||
16 | drbd is distributed in the hope that it will be useful, | ||
17 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | GNU General Public License for more details. | ||
20 | |||
21 | You should have received a copy of the GNU General Public License | ||
22 | along with drbd; see the file COPYING. If not, write to | ||
23 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
24 | |||
25 | */ | ||
26 | #ifndef DRBD_H | ||
27 | #define DRBD_H | ||
28 | #include <linux/connector.h> | ||
29 | #include <asm/types.h> | ||
30 | |||
31 | #ifdef __KERNEL__ | ||
32 | #include <linux/types.h> | ||
33 | #include <asm/byteorder.h> | ||
34 | #else | ||
35 | #include <sys/types.h> | ||
36 | #include <sys/wait.h> | ||
37 | #include <limits.h> | ||
38 | |||
39 | /* Although the Linux source code makes a difference between | ||
40 | generic endianness and the bitfields' endianness, there is no | ||
41 | architecture as of Linux-2.6.24-rc4 where the bitfields' endianness | ||
42 | does not match the generic endianness. */ | ||
43 | |||
44 | #if __BYTE_ORDER == __LITTLE_ENDIAN | ||
45 | #define __LITTLE_ENDIAN_BITFIELD | ||
46 | #elif __BYTE_ORDER == __BIG_ENDIAN | ||
47 | #define __BIG_ENDIAN_BITFIELD | ||
48 | #else | ||
49 | # error "sorry, weird endianness on this box" | ||
50 | #endif | ||
51 | |||
52 | #endif | ||
53 | |||
54 | |||
55 | extern const char *drbd_buildtag(void); | ||
56 | #define REL_VERSION "8.3.3rc2" | ||
57 | #define API_VERSION 88 | ||
58 | #define PRO_VERSION_MIN 86 | ||
59 | #define PRO_VERSION_MAX 91 | ||
60 | |||
61 | |||
62 | enum drbd_io_error_p { /* I/O error policy; DRBD_ON_IO_ERROR_DEF in drbd_limits.h selects EP_PASS_ON */ | ||
63 | EP_PASS_ON, /* FIXME: would "Ignore" be a better name? */ | ||
64 | EP_CALL_HELPER, | ||
65 | EP_DETACH | ||
66 | }; | ||
67 | |||
68 | enum drbd_fencing_p { /* fencing policy; DRBD_FENCING_DEF in drbd_limits.h selects FP_DONT_CARE */ | ||
69 | FP_DONT_CARE, | ||
70 | FP_RESOURCE, | ||
71 | FP_STONITH | ||
72 | }; | ||
73 | |||
74 | enum drbd_disconnect_p { /* NOTE(review): presumably the on-disconnect policy; no user is visible in these headers - confirm */ | ||
75 | DP_RECONNECT, | ||
76 | DP_DROP_NET_CONF, | ||
77 | DP_FREEZE_IO | ||
78 | }; | ||
79 | |||
80 | enum drbd_after_sb_p { /* policy values; the after_sb_0p/1p/2p and rr_conflict defaults in drbd_limits.h are all ASB_DISCONNECT */ | ||
81 | ASB_DISCONNECT, | ||
82 | ASB_DISCARD_YOUNGER_PRI, | ||
83 | ASB_DISCARD_OLDER_PRI, | ||
84 | ASB_DISCARD_ZERO_CHG, | ||
85 | ASB_DISCARD_LEAST_CHG, | ||
86 | ASB_DISCARD_LOCAL, | ||
87 | ASB_DISCARD_REMOTE, | ||
88 | ASB_CONSENSUS, | ||
89 | ASB_DISCARD_SECONDARY, | ||
90 | ASB_CALL_HELPER, | ||
91 | ASB_VIOLENTLY | ||
92 | }; | ||
93 | |||
94 | /* KEEP the order, do not delete or insert. Only append. Every code has an explicit value; 106, 109, 110, 113, 117, 128 and 136 are not assigned here. */ | ||
95 | enum drbd_ret_codes { | ||
96 | ERR_CODE_BASE = 100, | ||
97 | NO_ERROR = 101, | ||
98 | ERR_LOCAL_ADDR = 102, | ||
99 | ERR_PEER_ADDR = 103, | ||
100 | ERR_OPEN_DISK = 104, | ||
101 | ERR_OPEN_MD_DISK = 105, | ||
102 | ERR_DISK_NOT_BDEV = 107, | ||
103 | ERR_MD_NOT_BDEV = 108, | ||
104 | ERR_DISK_TO_SMALL = 111, | ||
105 | ERR_MD_DISK_TO_SMALL = 112, | ||
106 | ERR_BDCLAIM_DISK = 114, | ||
107 | ERR_BDCLAIM_MD_DISK = 115, | ||
108 | ERR_MD_IDX_INVALID = 116, | ||
109 | ERR_IO_MD_DISK = 118, | ||
110 | ERR_MD_INVALID = 119, | ||
111 | ERR_AUTH_ALG = 120, | ||
112 | ERR_AUTH_ALG_ND = 121, | ||
113 | ERR_NOMEM = 122, | ||
114 | ERR_DISCARD = 123, | ||
115 | ERR_DISK_CONFIGURED = 124, | ||
116 | ERR_NET_CONFIGURED = 125, | ||
117 | ERR_MANDATORY_TAG = 126, | ||
118 | ERR_MINOR_INVALID = 127, | ||
119 | ERR_INTR = 129, /* EINTR */ | ||
120 | ERR_RESIZE_RESYNC = 130, | ||
121 | ERR_NO_PRIMARY = 131, | ||
122 | ERR_SYNC_AFTER = 132, | ||
123 | ERR_SYNC_AFTER_CYCLE = 133, | ||
124 | ERR_PAUSE_IS_SET = 134, | ||
125 | ERR_PAUSE_IS_CLEAR = 135, | ||
126 | ERR_PACKET_NR = 137, | ||
127 | ERR_NO_DISK = 138, | ||
128 | ERR_NOT_PROTO_C = 139, | ||
129 | ERR_NOMEM_BITMAP = 140, | ||
130 | ERR_INTEGRITY_ALG = 141, /* DRBD 8.2 only */ | ||
131 | ERR_INTEGRITY_ALG_ND = 142, /* DRBD 8.2 only */ | ||
132 | ERR_CPU_MASK_PARSE = 143, /* DRBD 8.2 only */ | ||
133 | ERR_CSUMS_ALG = 144, /* DRBD 8.2 only */ | ||
134 | ERR_CSUMS_ALG_ND = 145, /* DRBD 8.2 only */ | ||
135 | ERR_VERIFY_ALG = 146, /* DRBD 8.2 only */ | ||
136 | ERR_VERIFY_ALG_ND = 147, /* DRBD 8.2 only */ | ||
137 | ERR_CSUMS_RESYNC_RUNNING= 148, /* DRBD 8.2 only */ | ||
138 | ERR_VERIFY_RUNNING = 149, /* DRBD 8.2 only */ | ||
139 | ERR_DATA_NOT_CURRENT = 150, | ||
140 | ERR_CONNECTED = 151, /* DRBD 8.3 only */ | ||
141 | |||
142 | /* insert new ones above this line */ | ||
143 | AFTER_LAST_ERR_CODE /* no explicit value: one past the last code above (currently 152) */ | ||
144 | }; | ||
145 | |||
146 | #define DRBD_PROT_A 1 | ||
147 | #define DRBD_PROT_B 2 | ||
148 | #define DRBD_PROT_C 3 | ||
149 | |||
150 | enum drbd_role { /* values stored in the 2-bit role and peer fields of union drbd_state */ | ||
151 | R_UNKNOWN = 0, | ||
152 | R_PRIMARY = 1, /* role */ | ||
153 | R_SECONDARY = 2, /* role */ | ||
154 | R_MASK = 3, /* all bits of a 2-bit role field set */ | ||
155 | }; | ||
156 | |||
157 | /* The order of these constants is important. | ||
158 | * The lower ones (<C_WF_REPORT_PARAMS) indicate | ||
159 | * that there is no socket! | ||
160 | * >=C_WF_REPORT_PARAMS ==> There is a socket | ||
161 | */ | ||
162 | enum drbd_conns { /* values stored in the 5-bit conn field of union drbd_state */ | ||
163 | C_STANDALONE, | ||
164 | C_DISCONNECTING, /* Temporary state on the way to StandAlone. */ | ||
165 | C_UNCONNECTED, /* >= C_UNCONNECTED -> inc_net() succeeds */ | ||
166 | |||
167 | /* These temporary states are all used on the way | ||
168 | * from >= C_CONNECTED to Unconnected. | ||
169 | * They are the 'disconnect reason' states; | ||
170 | * changing between them is not allowed. */ | ||
171 | C_TIMEOUT, | ||
172 | C_BROKEN_PIPE, | ||
173 | C_NETWORK_FAILURE, | ||
174 | C_PROTOCOL_ERROR, | ||
175 | C_TEAR_DOWN, | ||
176 | |||
177 | C_WF_CONNECTION, | ||
178 | C_WF_REPORT_PARAMS, /* we have a socket */ | ||
179 | C_CONNECTED, /* we have introduced each other */ | ||
180 | C_STARTING_SYNC_S, /* starting full sync by admin request. */ | ||
181 | C_STARTING_SYNC_T, /* starting full sync by admin request. */ | ||
182 | C_WF_BITMAP_S, | ||
183 | C_WF_BITMAP_T, | ||
184 | C_WF_SYNC_UUID, | ||
185 | |||
186 | /* All SyncStates are tested with this comparison | ||
187 | * xx >= C_SYNC_SOURCE && xx <= C_PAUSED_SYNC_T */ | ||
188 | C_SYNC_SOURCE, | ||
189 | C_SYNC_TARGET, | ||
190 | C_VERIFY_S, | ||
191 | C_VERIFY_T, | ||
192 | C_PAUSED_SYNC_S, | ||
193 | C_PAUSED_SYNC_T, | ||
194 | C_MASK = 31 /* all bits of the 5-bit conn field set; the states above use 0..21 */ | ||
195 | }; | ||
196 | |||
197 | enum drbd_disk_state { /* values stored in the 4-bit disk and pdsk fields of union drbd_state */ | ||
198 | D_DISKLESS, | ||
199 | D_ATTACHING, /* In the process of reading the meta-data */ | ||
200 | D_FAILED, /* Becomes D_DISKLESS as soon as we told the peer about it */ | ||
201 | /* when >= D_FAILED it is legal to access mdev->bc */ | ||
202 | D_NEGOTIATING, /* Late attaching state, we need to talk to the peer */ | ||
203 | D_INCONSISTENT, | ||
204 | D_OUTDATED, | ||
205 | D_UNKNOWN, /* Only used for the peer, never for myself */ | ||
206 | D_CONSISTENT, /* Might be D_OUTDATED, might be D_UP_TO_DATE ... */ | ||
207 | D_UP_TO_DATE, /* Only this disk state allows applications' IO ! */ | ||
208 | D_MASK = 15 /* all bits of a 4-bit disk field set; the states above use 0..8 */ | ||
209 | }; | ||
210 | |||
211 | union drbd_state { /* bit-field view (widths add up to 32 bits) overlaid with the raw integer "i" */ | ||
212 | /* According to gcc's docs, | ||
213 | * the order of allocation of bit-fields within a unit (C90 6.5.2.1, C99 6.7.2.1) | ||
214 | * is determined by the ABI. | ||
215 | * Pointed out by Maxim Uvarov <muvarov@ru.mvista.com>: | ||
216 | * even though we transmit as "cpu_to_be32(state)", | ||
217 | * the offsets of the bitfields still need to be swapped | ||
218 | * on different endianness. | ||
219 | */ | ||
220 | struct { | ||
221 | #if defined(__LITTLE_ENDIAN_BITFIELD) | ||
222 | unsigned role:2 ; /* 3/4 primary/secondary/unknown */ | ||
223 | unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ | ||
224 | unsigned conn:5 ; /* 22/32 cstates */ | ||
225 | unsigned disk:4 ; /* 9/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
226 | unsigned pdsk:4 ; /* 9/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
227 | unsigned susp:1 ; /* 2/2 IO suspended no/yes */ | ||
228 | unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ | ||
229 | unsigned peer_isp:1 ; | ||
230 | unsigned user_isp:1 ; | ||
231 | unsigned _pad:11; /* 0 unused */ | ||
232 | #elif defined(__BIG_ENDIAN_BITFIELD) | ||
233 | unsigned _pad:11; /* 0 unused */ | ||
234 | unsigned user_isp:1 ; | ||
235 | unsigned peer_isp:1 ; | ||
236 | unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ | ||
237 | unsigned susp:1 ; /* 2/2 IO suspended no/yes */ | ||
238 | unsigned pdsk:4 ; /* 9/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
239 | unsigned disk:4 ; /* 9/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
240 | unsigned conn:5 ; /* 22/32 cstates */ | ||
241 | unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ | ||
242 | unsigned role:2 ; /* 3/4 primary/secondary/unknown */ | ||
243 | #else | ||
244 | # error "this endianess is not supported" | ||
245 | #endif | ||
246 | }; | ||
247 | unsigned int i; /* all of the fields above as one integer */ | ||
248 | }; | ||
249 | |||
250 | enum drbd_state_ret_codes { /* positive: success/no-op results; zero and negative: errors; -3 is not assigned here */ | ||
251 | SS_CW_NO_NEED = 4, | ||
252 | SS_CW_SUCCESS = 3, | ||
253 | SS_NOTHING_TO_DO = 2, | ||
254 | SS_SUCCESS = 1, | ||
255 | SS_UNKNOWN_ERROR = 0, /* Used to sleep longer in _drbd_request_state */ | ||
256 | SS_TWO_PRIMARIES = -1, | ||
257 | SS_NO_UP_TO_DATE_DISK = -2, | ||
258 | SS_NO_LOCAL_DISK = -4, | ||
259 | SS_NO_REMOTE_DISK = -5, | ||
260 | SS_CONNECTED_OUTDATES = -6, | ||
261 | SS_PRIMARY_NOP = -7, | ||
262 | SS_RESYNC_RUNNING = -8, | ||
263 | SS_ALREADY_STANDALONE = -9, | ||
264 | SS_CW_FAILED_BY_PEER = -10, | ||
265 | SS_IS_DISKLESS = -11, | ||
266 | SS_DEVICE_IN_USE = -12, | ||
267 | SS_NO_NET_CONFIG = -13, | ||
268 | SS_NO_VERIFY_ALG = -14, /* drbd-8.2 only */ | ||
269 | SS_NEED_CONNECTION = -15, /* drbd-8.2 only */ | ||
270 | SS_LOWER_THAN_OUTDATED = -16, | ||
271 | SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */ | ||
272 | SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ | ||
273 | SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ | ||
274 | SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */ | ||
275 | }; | ||
276 | |||
277 | /* from drbd_strings.c */ | ||
278 | extern const char *drbd_conn_str(enum drbd_conns); | ||
279 | extern const char *drbd_role_str(enum drbd_role); | ||
280 | extern const char *drbd_disk_str(enum drbd_disk_state); | ||
281 | extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes); | ||
282 | |||
283 | #define SHARED_SECRET_MAX 64 | ||
284 | |||
285 | #define MDF_CONSISTENT (1 << 0) | ||
286 | #define MDF_PRIMARY_IND (1 << 1) | ||
287 | #define MDF_CONNECTED_IND (1 << 2) | ||
288 | #define MDF_FULL_SYNC (1 << 3) | ||
289 | #define MDF_WAS_UP_TO_DATE (1 << 4) | ||
290 | #define MDF_PEER_OUT_DATED (1 << 5) | ||
291 | #define MDF_CRASHED_PRIMARY (1 << 6) | ||
292 | |||
293 | enum drbd_uuid_index { /* implicit values 0..6 in the order listed */ | ||
294 | UI_CURRENT, | ||
295 | UI_BITMAP, | ||
296 | UI_HISTORY_START, | ||
297 | UI_HISTORY_END, | ||
298 | UI_SIZE, /* nl-packet: number of dirty bits; its value (4) also sizes the uuids tag in drbd_nl.h */ | ||
299 | UI_FLAGS, /* nl-packet: flags */ | ||
300 | UI_EXTENDED_SIZE /* Everything. */ | ||
301 | }; | ||
302 | |||
303 | enum drbd_timeout_flag { /* NOTE(review): presumably selects among the wfc / degr-wfc / outdated-wfc timeouts of drbd_limits.h - confirm */ | ||
304 | UT_DEFAULT = 0, | ||
305 | UT_DEGRADED = 1, | ||
306 | UT_PEER_OUTDATED = 2, | ||
307 | }; | ||
308 | |||
309 | #define UUID_JUST_CREATED ((__u64)4) | ||
310 | |||
311 | #define DRBD_MAGIC 0x83740267 | ||
312 | #define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) | ||
313 | |||
314 | /* these are of type "int" */ | ||
315 | #define DRBD_MD_INDEX_INTERNAL -1 | ||
316 | #define DRBD_MD_INDEX_FLEX_EXT -2 | ||
317 | #define DRBD_MD_INDEX_FLEX_INT -3 | ||
318 | |||
319 | /* Start of the new netlink/connector stuff */ | ||
320 | |||
321 | #define DRBD_NL_CREATE_DEVICE 0x01 | ||
322 | #define DRBD_NL_SET_DEFAULTS 0x02 | ||
323 | |||
324 | /* The following line should be moved over to linux/connector.h | ||
325 | * when the time comes */ | ||
326 | #ifndef CN_IDX_DRBD | ||
327 | # define CN_IDX_DRBD 0x4 | ||
328 | /* Ubuntu "intrepid ibex" release defined CN_IDX_DRBD as 0x6 */ | ||
329 | #endif | ||
330 | #define CN_VAL_DRBD 0x1 | ||
331 | |||
332 | /* For searching a vacant cn_idx value */ | ||
333 | #define CN_IDX_STEP 6977 | ||
334 | |||
335 | struct drbd_nl_cfg_req { /* fixed header followed by a variable-length tag list */ | ||
336 | int packet_type; /* presumably one of enum packet_types (drbd_tag_magic.h) - confirm */ | ||
337 | unsigned int drbd_minor; | ||
338 | int flags; /* presumably DRBD_NL_CREATE_DEVICE / DRBD_NL_SET_DEFAULTS - confirm */ | ||
339 | unsigned short tag_list[]; /* flexible array member, must stay last */ | ||
340 | }; | ||
341 | |||
342 | struct drbd_nl_cfg_reply { /* fixed header followed by a variable-length tag list */ | ||
343 | int packet_type; | ||
344 | unsigned int minor; | ||
345 | int ret_code; /* enum ret_code or set_st_err_t; NOTE(review): presumably enum drbd_ret_codes / enum drbd_state_ret_codes in this header - confirm */ | ||
346 | unsigned short tag_list[]; /* only used with get_* calls */ | ||
347 | }; | ||
348 | |||
349 | #endif | ||
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h new file mode 100644 index 000000000000..9d067ce46960 --- /dev/null +++ b/include/linux/drbd_limits.h | |||
@@ -0,0 +1,137 @@ | |||
1 | /* | ||
2 | drbd_limits.h | ||
3 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
4 | */ | ||
5 | |||
6 | /* | ||
7 | * Our current limitations. | ||
8 | * Some of them are hard limits, | ||
9 | * some of them are arbitrary range limits, that make it easier to provide | ||
10 | * feedback about nonsense settings for certain configurable values. | ||
11 | */ | ||
12 | |||
13 | #ifndef DRBD_LIMITS_H | ||
14 | #define DRBD_LIMITS_H 1 | ||
15 | |||
16 | #define DEBUG_RANGE_CHECK 0 | ||
17 | |||
18 | #define DRBD_MINOR_COUNT_MIN 1 | ||
19 | #define DRBD_MINOR_COUNT_MAX 255 | ||
20 | |||
21 | #define DRBD_DIALOG_REFRESH_MIN 0 | ||
22 | #define DRBD_DIALOG_REFRESH_MAX 600 | ||
23 | |||
24 | /* valid port number */ | ||
25 | #define DRBD_PORT_MIN 1 | ||
26 | #define DRBD_PORT_MAX 0xffff | ||
27 | |||
28 | /* startup { */ | ||
29 | /* if you want more than 3.4 days, disable */ | ||
30 | #define DRBD_WFC_TIMEOUT_MIN 0 | ||
31 | #define DRBD_WFC_TIMEOUT_MAX 300000 | ||
32 | #define DRBD_WFC_TIMEOUT_DEF 0 | ||
33 | |||
34 | #define DRBD_DEGR_WFC_TIMEOUT_MIN 0 | ||
35 | #define DRBD_DEGR_WFC_TIMEOUT_MAX 300000 | ||
36 | #define DRBD_DEGR_WFC_TIMEOUT_DEF 0 | ||
37 | |||
38 | #define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0 | ||
39 | #define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000 | ||
40 | #define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0 | ||
41 | /* }*/ | ||
42 | |||
43 | /* net { */ | ||
44 | /* timeout, unit centi seconds | ||
45 | * more than one minute timeout is not useful */ | ||
46 | #define DRBD_TIMEOUT_MIN 1 | ||
47 | #define DRBD_TIMEOUT_MAX 600 | ||
48 | #define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ | ||
49 | |||
50 | /* active connection retries when C_WF_CONNECTION */ | ||
51 | #define DRBD_CONNECT_INT_MIN 1 | ||
52 | #define DRBD_CONNECT_INT_MAX 120 | ||
53 | #define DRBD_CONNECT_INT_DEF 10 /* seconds */ | ||
54 | |||
55 | /* keep-alive probes when idle */ | ||
56 | #define DRBD_PING_INT_MIN 1 | ||
57 | #define DRBD_PING_INT_MAX 120 | ||
58 | #define DRBD_PING_INT_DEF 10 | ||
59 | |||
60 | /* timeout for the ping packets.*/ | ||
61 | #define DRBD_PING_TIMEO_MIN 1 | ||
62 | #define DRBD_PING_TIMEO_MAX 100 | ||
63 | #define DRBD_PING_TIMEO_DEF 5 | ||
64 | |||
65 | /* max number of write requests between write barriers */ | ||
66 | #define DRBD_MAX_EPOCH_SIZE_MIN 1 | ||
67 | #define DRBD_MAX_EPOCH_SIZE_MAX 20000 | ||
68 | #define DRBD_MAX_EPOCH_SIZE_DEF 2048 | ||
69 | |||
70 | /* I don't think that a tcp send buffer of more than 10M is useful */ | ||
71 | #define DRBD_SNDBUF_SIZE_MIN 0 | ||
72 | #define DRBD_SNDBUF_SIZE_MAX (10<<20) | ||
73 | #define DRBD_SNDBUF_SIZE_DEF (2*65535) | ||
74 | |||
75 | #define DRBD_RCVBUF_SIZE_MIN 0 | ||
76 | #define DRBD_RCVBUF_SIZE_MAX (10<<20) | ||
77 | #define DRBD_RCVBUF_SIZE_DEF (2*65535) | ||
78 | |||
79 | /* @4k PageSize -> 128kB - 512MB */ | ||
80 | #define DRBD_MAX_BUFFERS_MIN 32 | ||
81 | #define DRBD_MAX_BUFFERS_MAX 131072 | ||
82 | #define DRBD_MAX_BUFFERS_DEF 2048 | ||
83 | |||
84 | /* @4k PageSize -> 4kB - 512MB */ | ||
85 | #define DRBD_UNPLUG_WATERMARK_MIN 1 | ||
86 | #define DRBD_UNPLUG_WATERMARK_MAX 131072 | ||
87 | #define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16) | ||
88 | |||
89 | /* 0 is disabled. | ||
90 | * 200 should be more than enough even for very short timeouts */ | ||
91 | #define DRBD_KO_COUNT_MIN 0 | ||
92 | #define DRBD_KO_COUNT_MAX 200 | ||
93 | #define DRBD_KO_COUNT_DEF 0 | ||
94 | /* } */ | ||
95 | |||
96 | /* syncer { */ | ||
97 | /* FIXME allow rate to be zero? */ | ||
98 | #define DRBD_RATE_MIN 1 | ||
99 | /* channel bonding 10 GbE, or other hardware */ | ||
100 | #define DRBD_RATE_MAX (4 << 20) | ||
101 | #define DRBD_RATE_DEF 250 /* kb/second */ | ||
102 | |||
103 | /* less than 7 would hit performance unnecessarily. | ||
104 | * 3833 is the largest prime that still does fit | ||
105 | * into 64 sectors of activity log */ | ||
106 | #define DRBD_AL_EXTENTS_MIN 7 | ||
107 | #define DRBD_AL_EXTENTS_MAX 3833 | ||
108 | #define DRBD_AL_EXTENTS_DEF 127 | ||
109 | |||
110 | #define DRBD_AFTER_MIN -1 | ||
111 | #define DRBD_AFTER_MAX 255 | ||
112 | #define DRBD_AFTER_DEF -1 | ||
113 | |||
114 | /* } */ | ||
115 | |||
116 | /* drbdsetup XY resize -d Z | ||
117 | * you are free to reduce the device size to nothing, if you want to. | ||
118 | * the upper limit with 64bit kernel, enough ram and flexible meta data | ||
119 | * is 16 TB, currently. */ | ||
120 | /* DRBD_MAX_SECTORS */ | ||
121 | #define DRBD_DISK_SIZE_SECT_MIN 0 | ||
122 | #define DRBD_DISK_SIZE_SECT_MAX (16 * (2LLU << 30)) | ||
123 | #define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */ | ||
124 | |||
125 | #define DRBD_ON_IO_ERROR_DEF EP_PASS_ON | ||
126 | #define DRBD_FENCING_DEF FP_DONT_CARE | ||
127 | #define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT | ||
128 | #define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT | ||
129 | #define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT | ||
130 | #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT | ||
131 | |||
132 | #define DRBD_MAX_BIO_BVECS_MIN 0 | ||
133 | #define DRBD_MAX_BIO_BVECS_MAX 128 | ||
134 | #define DRBD_MAX_BIO_BVECS_DEF 0 | ||
135 | |||
136 | #undef RANGE | ||
137 | #endif | ||
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h new file mode 100644 index 000000000000..db5721ad50d1 --- /dev/null +++ b/include/linux/drbd_nl.h | |||
@@ -0,0 +1,137 @@ | |||
1 | /* | ||
2 | NL_PACKET( name, number, | ||
3 | NL_<TYPE> ( pn, pr, member ) | ||
4 | ... | ||
5 | ) | ||
6 | |||
7 | You may never reuse one of the pn arguments | ||
8 | */ | ||
9 | |||
10 | #if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64) | ||
11 | #error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined" | ||
12 | #endif | ||
13 | |||
14 | NL_PACKET(primary, 1, | ||
15 | NL_BIT( 1, T_MAY_IGNORE, overwrite_peer) | ||
16 | ) | ||
17 | |||
18 | NL_PACKET(secondary, 2, ) | ||
19 | |||
20 | NL_PACKET(disk_conf, 3, | ||
21 | NL_INT64( 2, T_MAY_IGNORE, disk_size) | ||
22 | NL_STRING( 3, T_MANDATORY, backing_dev, 128) | ||
23 | NL_STRING( 4, T_MANDATORY, meta_dev, 128) | ||
24 | NL_INTEGER( 5, T_MANDATORY, meta_dev_idx) | ||
25 | NL_INTEGER( 6, T_MAY_IGNORE, on_io_error) | ||
26 | NL_INTEGER( 7, T_MAY_IGNORE, fencing) | ||
27 | NL_BIT( 37, T_MAY_IGNORE, use_bmbv) | ||
28 | NL_BIT( 53, T_MAY_IGNORE, no_disk_flush) | ||
29 | NL_BIT( 54, T_MAY_IGNORE, no_md_flush) | ||
30 | /* 55 max_bio_size was available in 8.2.6rc2 */ | ||
31 | NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs) | ||
32 | NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier) | ||
33 | NL_BIT( 58, T_MAY_IGNORE, no_disk_drain) | ||
34 | ) | ||
35 | |||
36 | NL_PACKET(detach, 4, ) | ||
37 | |||
38 | NL_PACKET(net_conf, 5, | ||
39 | NL_STRING( 8, T_MANDATORY, my_addr, 128) | ||
40 | NL_STRING( 9, T_MANDATORY, peer_addr, 128) | ||
41 | NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX) | ||
42 | NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX) | ||
43 | NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX) | ||
44 | NL_INTEGER( 14, T_MAY_IGNORE, timeout) | ||
45 | NL_INTEGER( 15, T_MANDATORY, wire_protocol) | ||
46 | NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int) | ||
47 | NL_INTEGER( 17, T_MAY_IGNORE, ping_int) | ||
48 | NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size) | ||
49 | NL_INTEGER( 19, T_MAY_IGNORE, max_buffers) | ||
50 | NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark) | ||
51 | NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size) | ||
52 | NL_INTEGER( 22, T_MAY_IGNORE, ko_count) | ||
53 | NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p) | ||
54 | NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p) | ||
55 | NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p) | ||
56 | NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict) | ||
57 | NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo) | ||
58 | NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size) | ||
59 | /* 59 addr_family was available in GIT, never released */ | ||
60 | NL_BIT( 60, T_MANDATORY, mind_af) | ||
61 | NL_BIT( 27, T_MAY_IGNORE, want_lose) | ||
62 | NL_BIT( 28, T_MAY_IGNORE, two_primaries) | ||
63 | NL_BIT( 41, T_MAY_IGNORE, always_asbp) | ||
64 | NL_BIT( 61, T_MAY_IGNORE, no_cork) | ||
65 | NL_BIT( 62, T_MANDATORY, auto_sndbuf_size) | ||
66 | ) | ||
67 | |||
68 | NL_PACKET(disconnect, 6, ) | ||
69 | |||
70 | NL_PACKET(resize, 7, | ||
71 | NL_INT64( 29, T_MAY_IGNORE, resize_size) | ||
72 | ) | ||
73 | |||
74 | NL_PACKET(syncer_conf, 8, | ||
75 | NL_INTEGER( 30, T_MAY_IGNORE, rate) | ||
76 | NL_INTEGER( 31, T_MAY_IGNORE, after) | ||
77 | NL_INTEGER( 32, T_MAY_IGNORE, al_extents) | ||
78 | NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX) | ||
79 | NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) | ||
80 | NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) | ||
81 | NL_BIT( 65, T_MAY_IGNORE, use_rle) | ||
82 | ) | ||
83 | |||
84 | NL_PACKET(invalidate, 9, ) | ||
85 | NL_PACKET(invalidate_peer, 10, ) | ||
86 | NL_PACKET(pause_sync, 11, ) | ||
87 | NL_PACKET(resume_sync, 12, ) | ||
88 | NL_PACKET(suspend_io, 13, ) | ||
89 | NL_PACKET(resume_io, 14, ) | ||
90 | NL_PACKET(outdate, 15, ) | ||
91 | NL_PACKET(get_config, 16, ) | ||
92 | NL_PACKET(get_state, 17, | ||
93 | NL_INTEGER( 33, T_MAY_IGNORE, state_i) | ||
94 | ) | ||
95 | |||
96 | NL_PACKET(get_uuids, 18, | ||
97 | NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64))) | ||
98 | NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags) | ||
99 | ) | ||
100 | |||
101 | NL_PACKET(get_timeout_flag, 19, | ||
102 | NL_BIT( 36, T_MAY_IGNORE, use_degraded) | ||
103 | ) | ||
104 | |||
105 | NL_PACKET(call_helper, 20, | ||
106 | NL_STRING( 38, T_MAY_IGNORE, helper, 32) | ||
107 | ) | ||
108 | |||
109 | /* Tag nr 42 already allocated in drbd-8.1 development. */ | ||
110 | |||
111 | NL_PACKET(sync_progress, 23, | ||
112 | NL_INTEGER( 43, T_MAY_IGNORE, sync_progress) | ||
113 | ) | ||
114 | |||
115 | NL_PACKET(dump_ee, 24, | ||
116 | NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32) | ||
117 | NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX) | ||
118 | NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX) | ||
119 | NL_INT64( 48, T_MAY_IGNORE, ee_sector) | ||
120 | NL_INT64( 49, T_MAY_IGNORE, ee_block_id) | ||
121 | NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10) | ||
122 | ) | ||
123 | |||
124 | NL_PACKET(start_ov, 25, | ||
125 | NL_INT64( 66, T_MAY_IGNORE, start_sector) | ||
126 | ) | ||
127 | |||
128 | NL_PACKET(new_c_uuid, 26, | ||
129 | NL_BIT( 63, T_MANDATORY, clear_bm) | ||
130 | ) | ||
131 | |||
132 | #undef NL_PACKET | ||
133 | #undef NL_INTEGER | ||
134 | #undef NL_INT64 | ||
135 | #undef NL_BIT | ||
136 | #undef NL_STRING | ||
137 | |||
diff --git a/include/linux/drbd_tag_magic.h b/include/linux/drbd_tag_magic.h new file mode 100644 index 000000000000..fcdff8410e99 --- /dev/null +++ b/include/linux/drbd_tag_magic.h | |||
@@ -0,0 +1,83 @@ | |||
1 | #ifndef DRBD_TAG_MAGIC_H | ||
2 | #define DRBD_TAG_MAGIC_H | ||
3 | |||
4 | #define TT_END 0 | ||
5 | #define TT_REMOVED 0xE000 | ||
6 | |||
7 | /* declare packet_type enums: each NL_PACKET(name, number, ...) in drbd_nl.h becomes P_name = number; the field macros expand to nothing. drbd_nl.h #undefs all five macros at its end, so they are redefined before every include. */ | ||
8 | enum packet_types { | ||
9 | #define NL_PACKET(name, number, fields) P_ ## name = number, | ||
10 | #define NL_INTEGER(pn, pr, member) | ||
11 | #define NL_INT64(pn, pr, member) | ||
12 | #define NL_BIT(pn, pr, member) | ||
13 | #define NL_STRING(pn, pr, member, len) | ||
14 | #include "drbd_nl.h" | ||
15 | P_nl_after_last_packet, /* one past the last packet listed in drbd_nl.h (currently 27) */ | ||
16 | }; | ||
17 | |||
18 | /* These structs are used to deduce the size of the tag lists: one struct <name>_tag_len_struct per packet, holding each field plus an int for its tag and length. */ | ||
19 | #define NL_PACKET(name, number, fields) \ | ||
20 | struct name ## _tag_len_struct { fields }; | ||
21 | #define NL_INTEGER(pn, pr, member) \ | ||
22 | int member; int tag_and_len ## member; | ||
23 | #define NL_INT64(pn, pr, member) \ | ||
24 | __u64 member; int tag_and_len ## member; | ||
25 | #define NL_BIT(pn, pr, member) \ | ||
26 | unsigned char member:1; int tag_and_len ## member; | ||
27 | #define NL_STRING(pn, pr, member, len) \ | ||
28 | unsigned char member[len]; int member ## _len; \ | ||
29 | int tag_and_len ## member; | ||
30 | #include "linux/drbd_nl.h" /* NOTE(review): every other include of this file here uses "drbd_nl.h" - confirm the differing path is intended */ | ||
31 | |||
32 | /* declare tag-list-sizes: one entry per packet = 2 plus, for every field, 4 plus the payload size. NOTE(review): the initializer is positional, so the index follows the order in drbd_nl.h (primary is index 0) and stops tracking the packet number after the gap at 21/22. */ | ||
33 | static const int tag_list_sizes[] = { | ||
34 | #define NL_PACKET(name, number, fields) 2 fields , | ||
35 | #define NL_INTEGER(pn, pr, member) + 4 + 4 | ||
36 | #define NL_INT64(pn, pr, member) + 4 + 8 | ||
37 | #define NL_BIT(pn, pr, member) + 4 + 1 | ||
38 | #define NL_STRING(pn, pr, member, len) + 4 + (len) | ||
39 | #include "drbd_nl.h" | ||
40 | }; | ||
41 | |||
42 | /* The two highest bits are used for the tag type */ | ||
43 | #define TT_MASK 0xC000 | ||
44 | #define TT_INTEGER 0x0000 | ||
45 | #define TT_INT64 0x4000 | ||
46 | #define TT_BIT 0x8000 | ||
47 | #define TT_STRING 0xC000 | ||
48 | /* The next bit indicates if processing of the tag is mandatory */ | ||
49 | #define T_MANDATORY 0x2000 | ||
50 | #define T_MAY_IGNORE 0x0000 | ||
51 | #define TN_MASK 0x1fff | ||
52 | /* The remaining 13 bits are used to enumerate the tags */ | ||
53 | |||
54 | #define tag_type(T) ((T) & TT_MASK) | ||
55 | #define tag_number(T) ((T) & TN_MASK) | ||
56 | |||
57 | /* declare tag enums: T_<member> = tag number | type bits (TT_*) | mandatory flag (T_MANDATORY or T_MAY_IGNORE) */ | ||
58 | #define NL_PACKET(name, number, fields) fields | ||
59 | enum drbd_tags { | ||
60 | #define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr , | ||
61 | #define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr , | ||
62 | #define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr , | ||
63 | #define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr , | ||
64 | #include "drbd_nl.h" | ||
65 | }; | ||
66 | |||
67 | struct tag { /* element type of tag_descriptions[] */ | ||
68 | const char *name; /* the member name as a string */ | ||
69 | int type_n_flags; /* TT_* type bits or'ed with T_MANDATORY / T_MAY_IGNORE */ | ||
70 | int max_len; /* sizeof the value, or the declared len for strings */ | ||
71 | }; | ||
72 | |||
73 | /* declare tag names: table indexed by tag number (pn) via designated initializers. NOTE(review): NL_BIT records sizeof(int) here while tag_list_sizes counts 1 byte per bit - confirm this is intended. */ | ||
74 | #define NL_PACKET(name, number, fields) fields | ||
75 | static const struct tag tag_descriptions[] = { | ||
76 | #define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) }, | ||
77 | #define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) }, | ||
78 | #define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) }, | ||
79 | #define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) }, | ||
80 | #include "drbd_nl.h" | ||
81 | }; | ||
82 | |||
83 | #endif | ||
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h new file mode 100644 index 000000000000..3a2b2d9b0472 --- /dev/null +++ b/include/linux/lru_cache.h | |||
@@ -0,0 +1,294 @@ | |||
1 | /* | ||
2 | lru_cache.h | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | drbd is free software; you can redistribute it and/or modify | ||
11 | it under the terms of the GNU General Public License as published by | ||
12 | the Free Software Foundation; either version 2, or (at your option) | ||
13 | any later version. | ||
14 | |||
15 | drbd is distributed in the hope that it will be useful, | ||
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | GNU General Public License for more details. | ||
19 | |||
20 | You should have received a copy of the GNU General Public License | ||
21 | along with drbd; see the file COPYING. If not, write to | ||
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | |||
24 | */ | ||
25 | |||
26 | #ifndef LRU_CACHE_H | ||
27 | #define LRU_CACHE_H | ||
28 | |||
29 | #include <linux/list.h> | ||
30 | #include <linux/slab.h> | ||
31 | #include <linux/bitops.h> | ||
32 | #include <linux/string.h> /* for memset */ | ||
33 | #include <linux/seq_file.h> | ||
34 | |||
35 | /* | ||
36 | This header file (and its .c file; kernel-doc of functions see there) | ||
37 | define a helper framework to easily keep track of index:label associations, | ||
38 | and changes to an "active set" of objects, as well as pending transactions, | ||
39 | to persistently record those changes. | ||
40 | |||
41 | We use an LRU policy if it is necessary to "cool down" a region currently in | ||
42 | the active set before we can "heat" a previously unused region. | ||
43 | |||
44 | Because of this latter property, it is called "lru_cache". | ||
45 | As it actually Tracks Objects in an Active SeT, we could also call it | ||
46 | toast (incidentally that is what may happen to the data on the | ||
47 | backend storage upon next resync, if we don't get it right). | ||
48 | |||
49 | What for? | ||
50 | |||
51 | We replicate IO (more or less synchronously) to local and remote disk. | ||
52 | |||
53 | For crash recovery after replication node failure, | ||
54 | we need to resync all regions that have been target of in-flight WRITE IO | ||
55 | (in use, or "hot", regions), as we don't know whether or not those WRITEs have | ||
56 | made it to stable storage. | ||
57 | |||
58 | To avoid a "full resync", we need to persistently track these regions. | ||
59 | |||
60 | This is known as "write intent log", and can be implemented as on-disk | ||
61 | (coarse or fine grained) bitmap, or other meta data. | ||
62 | |||
63 | To avoid the overhead of frequent extra writes to this meta data area, | ||
64 | usually the condition is softened to regions that _may_ have been target of | ||
65 | in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent | ||
66 | bitmap, trading frequency of meta data transactions against amount of | ||
67 | (possibly unnecessary) resync traffic. | ||
68 | |||
69 | If we set a hard limit on the area that may be "hot" at any given time, we | ||
70 | limit the amount of resync traffic needed for crash recovery. | ||
71 | |||
72 | For recovery after replication link failure, | ||
73 | we need to resync all blocks that have been changed on the other replica | ||
74 | in the mean time, or, if both replicas have been changed independently [*], | ||
75 | all blocks that have been changed on either replica in the mean time. | ||
76 | [*] usually as a result of a cluster split-brain and insufficient protection. | ||
77 | but there are valid use cases to do this on purpose. | ||
78 | |||
79 | Tracking those blocks can be implemented as "dirty bitmap". | ||
80 | Having it fine-grained reduces the amount of resync traffic. | ||
81 | It should also be persistent, to allow for reboots (or crashes) | ||
82 | while the replication link is down. | ||
83 | |||
84 | There are various possible implementations for persistently storing | ||
85 | write intent log information, three of which are mentioned here. | ||
86 | |||
87 | "Chunk dirtying" | ||
88 | The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well. | ||
89 | To reduce the frequency of bitmap updates for write-intent log purposes, | ||
90 | one could dirty "chunks" (of some size) at a time of the (fine grained) | ||
91 | on-disk bitmap, while keeping the in-memory "dirty" bitmap as clean as | ||
92 | possible, flushing it to disk again when a previously "hot" (and on-disk | ||
93 | dirtied as full chunk) area "cools down" again (no IO in flight anymore, | ||
94 | and none expected in the near future either). | ||
95 | |||
96 | "Explicit (coarse) write intent bitmap" | ||
97 | Another implementation could choose a (probably coarse) explicit bitmap, | ||
98 | for write-intent log purposes, in addition to the fine grained dirty bitmap. | ||
99 | |||
100 | "Activity log" | ||
101 | Yet another implementation may keep track of the hot regions, by starting | ||
102 | with an empty set, and writing down a journal of region numbers that have | ||
103 | become "hot", or have "cooled down" again. | ||
104 | |||
105 | To be able to use a ring buffer for this journal of changes to the active | ||
106 | set, we not only record the actual changes to that set, but also record the | ||
107 | not changing members of the set in a round robin fashion. To do so, we use a | ||
108 | fixed (but configurable) number of slots which we can identify by index, and | ||
109 | associate region numbers (labels) with these indices. | ||
110 | For each transaction recording a change to the active set, we record the | ||
111 | change itself (index: -old_label, +new_label), and which index is associated | ||
112 | with which label (index: current_label) within a certain sliding window that | ||
113 | is moved further over the available indices with each such transaction. | ||
114 | |||
115 | Thus, for crash recovery, if the ringbuffer is sufficiently large, we can | ||
116 | accurately reconstruct the active set. | ||
117 | |||
118 | Sufficiently large depends only on maximum number of active objects, and the | ||
119 | size of the sliding window recording "index: current_label" associations within | ||
120 | each transaction. | ||
121 | |||
122 | This is what we call the "activity log". | ||
123 | |||
124 | Currently we need one activity log transaction per single label change, which | ||
125 | does not give much benefit over the "dirty chunks of bitmap" approach, other | ||
126 | than potentially fewer seeks. | ||
127 | |||
128 | We plan to change the transaction format to support multiple changes per | ||
129 | transaction, which then would reduce several (disjoint, "random") updates to | ||
130 | the bitmap into one transaction to the activity log ring buffer. | ||
131 | */ | ||
132 | |||
133 | /* this defines an element in a tracked set | ||
134 | * .colision is for hash table lookup. | ||
135 | * When we process a new IO request, we know its sector, thus can deduce the | ||
136 | * region number (label) easily. To do the label -> object lookup without a | ||
137 | * full list walk, we use a simple hash table. | ||
138 | * | ||
139 | * .list is on one of three lists: | ||
140 | * in_use: currently in use (refcnt > 0, lc_number != LC_FREE) | ||
141 | * lru: unused but ready to be reused or recycled | ||
142 | * (refcnt == 0, lc_number != LC_FREE), | ||
143 | * free: unused but ready to be recycled | ||
144 | * (refcnt == 0, lc_number == LC_FREE), | ||
145 | * | ||
146 | * an element is said to be "in the active set", | ||
147 | * if either on "in_use" or "lru", i.e. lc_number != LC_FREE. | ||
148 | * | ||
149 | * DRBD currently (May 2009) only uses 61 elements on the resync lru_cache | ||
150 | * (total memory usage 2 pages), and up to 3833 elements on the act_log | ||
151 | * lru_cache, totalling ~215 kB for 64bit architecture, ~53 pages. | ||
152 | * | ||
153 | * We usually do not actually free these objects again, but only "recycle" | ||
154 | * them, as the change "index: -old_label, +LC_FREE" would need a transaction | ||
155 | * as well. Which also means that using a kmem_cache to allocate the objects | ||
156 | * from wastes some resources. | ||
157 | * But it avoids high order page allocations in kmalloc. | ||
158 | */ | ||
159 | struct lc_element { | ||
160 | struct hlist_node colision; | ||
161 | struct list_head list; /* on one of the in_use, lru or free lists */ | ||
162 | unsigned refcnt; | ||
163 | /* back "pointer" into lru_cache->lc_element[index], | ||
164 | * for paranoia, and for element -> index lookup */ | ||
165 | unsigned lc_index; | ||
166 | /* if we want to track a larger set of objects, | ||
167 | * it needs to become arch independent u64 */ | ||
168 | unsigned lc_number; | ||
169 | |||
170 | /* special label when on free list */ | ||
171 | #define LC_FREE (~0U) | ||
172 | }; | ||
173 | |||
174 | struct lru_cache { | ||
175 | /* the least recently used item is kept at lru->prev */ | ||
176 | struct list_head lru; | ||
177 | struct list_head free; | ||
178 | struct list_head in_use; | ||
179 | |||
180 | /* the pre-created kmem cache to allocate the objects from */ | ||
181 | struct kmem_cache *lc_cache; | ||
182 | |||
183 | /* size of tracked objects, used to memset(,0,) them in lc_reset */ | ||
184 | size_t element_size; | ||
185 | /* offset of struct lc_element member in the tracked object */ | ||
186 | size_t element_off; | ||
187 | |||
188 | /* number of elements (indices) */ | ||
189 | unsigned int nr_elements; | ||
190 | /* Arbitrary limit on maximum tracked objects. Practical limit is much | ||
191 | * lower due to allocation failures, probably. For typical use cases, | ||
192 | * nr_elements should be a few thousand at most. | ||
193 | * This also limits the maximum value of lc_element.lc_index, allowing the | ||
194 | * 8 high bits of .lc_index to be overloaded with flags in the future. */ | ||
195 | #define LC_MAX_ACTIVE (1<<24) | ||
196 | |||
197 | /* statistics */ | ||
198 | unsigned used; /* number of elements currently on in_use list */ | ||
199 | unsigned long hits, misses, starving, dirty, changed; | ||
200 | |||
201 | /* see below: flag-bits for lru_cache */ | ||
202 | unsigned long flags; | ||
203 | |||
204 | /* when changing the label of an index element */ | ||
205 | unsigned int new_number; | ||
206 | |||
207 | /* for paranoia when changing the label of an index element */ | ||
208 | struct lc_element *changing_element; | ||
209 | |||
210 | void *lc_private; | ||
211 | const char *name; | ||
212 | |||
213 | /* nr_elements there */ | ||
214 | struct hlist_head *lc_slot; /* presumably the hash table heads used with lc_element.colision - confirm in the .c file */ | ||
215 | struct lc_element **lc_element; /* elements by index, cf. lc_element.lc_index */ | ||
216 | }; | ||
217 | |||
218 | |||
219 | /* flag-bits for lru_cache: bit numbers (__LC_*) followed by the matching masks (LC_*) */ | ||
220 | enum { | ||
221 | /* debugging aid, to catch concurrent access early. | ||
222 | * user needs to guarantee exclusive access by proper locking! */ | ||
223 | __LC_PARANOIA, | ||
224 | /* if we need to change the set, but currently there is a changing | ||
225 | * transaction pending, we are "dirty", and must defer further | ||
226 | * changing requests; also the bit taken by lc_try_lock() */ | ||
227 | __LC_DIRTY, | ||
228 | /* if we need to change the set, but currently there is no free nor | ||
229 | * unused element available, we are "starving", and must not give out | ||
230 | * further references, to guarantee that eventually some refcnt will | ||
231 | * drop to zero and we will be able to make progress again, changing | ||
232 | * the set, writing the transaction. | ||
233 | * if the statistics say we are frequently starving, | ||
234 | * nr_elements is too small. */ | ||
235 | __LC_STARVING, | ||
236 | }; | ||
237 | #define LC_PARANOIA (1<<__LC_PARANOIA) | ||
238 | #define LC_DIRTY (1<<__LC_DIRTY) | ||
239 | #define LC_STARVING (1<<__LC_STARVING) | ||
240 | /* public API; kernel-doc for these functions lives in the .c file */ | ||
241 | extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, | ||
242 | unsigned e_count, size_t e_size, size_t e_off); | ||
243 | extern void lc_reset(struct lru_cache *lc); | ||
244 | extern void lc_destroy(struct lru_cache *lc); | ||
245 | extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); | ||
246 | extern void lc_del(struct lru_cache *lc, struct lc_element *element); | ||
247 | |||
248 | extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); | ||
249 | extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); | ||
250 | extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); | ||
251 | extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); | ||
252 | extern void lc_changed(struct lru_cache *lc, struct lc_element *e); | ||
253 | |||
254 | struct seq_file; /* forward declaration; <linux/seq_file.h> is also included above */ | ||
255 | extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); | ||
256 | |||
257 | extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, | ||
258 | void (*detail) (struct seq_file *, struct lc_element *)); | ||
259 | |||
260 | /** | ||
261 | * lc_try_lock - can be used to stop lc_get() from changing the tracked set | ||
262 | * @lc: the lru cache to operate on | ||
263 | * | ||
264 | * Note that the reference counts and order on the active and lru lists may | ||
265 | * still change. Returns true if we acquired the lock, i.e. __LC_DIRTY was not already set. | ||
266 | */ | ||
267 | static inline int lc_try_lock(struct lru_cache *lc) | ||
268 | { | ||
269 | return !test_and_set_bit(__LC_DIRTY, &lc->flags); | ||
270 | } | ||
271 | |||
272 | /** | ||
273 | * lc_unlock - unlock @lc, allow lc_get() to change the set again | ||
274 | * @lc: the lru cache to operate on | ||
275 | */ | ||
276 | static inline void lc_unlock(struct lru_cache *lc) | ||
277 | { | ||
278 | clear_bit(__LC_DIRTY, &lc->flags); /* drop the bit taken by lc_try_lock() */ | ||
279 | smp_mb__after_clear_bit(); /* memory barrier issued after the bit clear */ | ||
280 | } | ||
281 | /* lc_is_used - true if lc_find() yields an element for @enr and its refcnt is non-zero */ | ||
282 | static inline int lc_is_used(struct lru_cache *lc, unsigned int enr) | ||
283 | { | ||
284 | struct lc_element *e = lc_find(lc, enr); | ||
285 | return e && e->refcnt; /* a NULL lookup result counts as "not used" */ | ||
286 | } | ||
287 | /* lc_entry - alias for container_of(): enclosing object of type @type from a pointer to its @member */ | ||
288 | #define lc_entry(ptr, type, member) \ | ||
289 | container_of(ptr, type, member) | ||
290 | /* NOTE(review): presumably the two directions of the index <-> element mapping - confirm in the .c file */ | ||
291 | extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i); | ||
292 | extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e); | ||
293 | |||
294 | #endif | ||