aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/multipath_wrandom.c
blob: 7e22f15d13dfecb562b58adadc409282835faba3 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
/*
 *              Weighted random policy for multipath.
 *
 *
 * Version:	$Id: multipath_wrandom.c,v 1.1.2.3 2004/09/22 07:51:40 elueck Exp $
 *
 * Authors:	Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/ip_fib.h>
#include <net/ip_mp_alg.h>

#define MULTIPATH_STATE_SIZE 15

struct multipath_candidate {
	struct multipath_candidate	*next;
	int				power;
	struct rtable			*rt;
};

struct multipath_dest {
	struct list_head	list;

	const struct fib_nh	*nh_info;
	__be32			netmask;
	__be32			network;
	unsigned char		prefixlen;

	struct rcu_head		rcu;
};

struct multipath_bucket {
	struct list_head	head;
	spinlock_t		lock;
};

struct multipath_route {
	struct list_head	list;

	int			oif;
	__be32			gw;
	struct list_head	dests;

	struct rcu_head		rcu;
};

/* state: primarily weight per route information */
static struct multipath_bucket state[MULTIPATH_STATE_SIZE];

/* interface to random number generation */
static unsigned int RANDOM_SEED = 93186752;

static inline unsigned int random(unsigned int ubound)
{
	static unsigned int a = 1588635695,
		q = 2,
		r = 1117695901;
	RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q);
	return RANDOM_SEED % ubound;
}

static unsigned char __multipath_lookup_weight(const struct flowi *fl,
					       const struct rtable *rt)
{
	const int state_idx = rt->idev->dev->ifindex % MULTIPATH_STATE_SIZE;
	struct multipath_route *r;
	struct multipath_route *target_route = NULL;
	struct multipath_dest *d;
	int weight = 1;

	/* lookup the weight information for a certain route */
	rcu_read_lock();

	/* find state entry for gateway or add one if necessary */
	list_for_each_entry_rcu(r, &state[state_idx].head, list) {
		if (r->gw == rt->rt_gateway &&
		    r->oif == rt->idev->dev->ifindex) {
			target_route = r;
			break;
		}
	}

	if (!target_route) {
		/* this should not happen... but we are prepared */
		printk( KERN_CRIT"%s: missing state for gateway: %u and " \
			"device %d\n", __FUNCTION__, rt->rt_gateway,
			rt->idev->dev->ifindex);
		goto out;
	}

	/* find state entry for destination */
	list_for_each_entry_rcu(d, &target_route->dests, list) {
		__be32 targetnetwork = fl->fl4_dst &
			inet_make_mask(d->prefixlen);

		if ((targetnetwork & d->netmask) == d->network) {
			weight = d->nh_info->nh_weight;
			goto out;
		}
	}

out:
	rcu_read_unlock();
	return weight;
}

static void wrandom_init_state(void)
{
	int i;

	for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
		INIT_LIST_HEAD(&state[i].head);
		spin_lock_init(&state[i].lock);
	}
}

static void wrandom_select_route(const struct flowi *flp,
				 struct rtable *first,
				 struct rtable **rp)
{
	struct rtable *rt;
	struct rtable *decision;
	struct multipath_candidate *first_mpc = NULL;
	struct multipath_candidate *mpc, *last_mpc = NULL;
	int power = 0;
	int last_power;
	int selector;
	const size_t size_mpc = sizeof(struct multipath_candidate);

	/* collect all candidates and identify their weights */
	for (rt = rcu_dereference(first); rt;
	     rt = rcu_dereference(rt->u.dst.rt_next)) {
		if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
		    multipath_comparekeys(&rt->fl, flp)) {
			struct multipath_candidate* mpc =
				(struct multipath_candidate*)
				kmalloc(size_mpc, GFP_ATOMIC);

			if (!mpc)
				return;

			power += __multipath_lookup_weight(flp, rt) * 10000;

			mpc->power = power;
			mpc->rt = rt;
			mpc->next = NULL;

			if (!first_mpc)
				first_mpc = mpc;
			else
				last_mpc->next = mpc;

			last_mpc = mpc;
		}
	}

	/* choose a weighted random candidate */
	decision = first;
	selector = random(power);
	last_power = 0;

	/* select candidate, adjust GC data and cleanup local state */
	decision = first;
	last_mpc = NULL;
	for (mpc = first_mpc; mpc; mpc = mpc->next) {
		mpc->rt->u.dst.lastuse = jiffies;
		if (last_power <= selector && selector < mpc->power)
			decision = mpc->rt;

		last_power = mpc->power;
		kfree(last_mpc);
		last_mpc = mpc;
	}

	/* concurrent __multipath_flush may lead to !last_mpc */
	kfree(last_mpc);

	decision->u.dst.__use++;
	*rp = decision;
}

static void wrandom_set_nhinfo(__be32 network,
			       __be32 netmask,
			       unsigned char prefixlen,
			       const struct fib_nh *nh)
{
	const int state_idx = nh->nh_oif % MULTIPATH_STATE_SIZE;
	struct multipath_route *r, *target_route = NULL;
	struct multipath_dest *d, *target_dest = NULL;

	/* store the weight information for a certain route */
	spin_lock_bh(&state[state_idx].lock);

	/* find state entry for gateway or add one if necessary */
	list_for_each_entry_rcu(r, &state[state_idx].head, list) {
		if (r->gw == nh->nh_gw && r->oif == nh->nh_oif) {
			target_route = r;
			break;
		}
	}

	if (!target_route) {
		const size_t size_rt = sizeof(struct multipath_route);
		target_route = (struct multipath_route *)
			kmalloc(size_rt, GFP_ATOMIC);

		target_route->gw = nh->nh_gw;
		target_route->oif = nh->nh_oif;
		memset(&target_route->rcu, 0, sizeof(struct rcu_head));
		INIT_LIST_HEAD(&target_route->dests);

		list_add_rcu(&target_route->list, &state[state_idx].head);
	}

	/* find state entry for destination or add one if necessary */
	list_for_each_entry_rcu(d, &target_route->dests, list) {
		if (d->nh_info == nh) {
			target_dest = d;
			break;
		}
	}

	if (!target_dest) {
		const size_t size_dst = sizeof(struct multipath_dest);
		target_dest = (struct multipath_dest*)
			kmalloc(size_dst, GFP_ATOMIC);

		target_dest->nh_info = nh;
		target_dest->network = network;
		target_dest->netmask = netmask;
		target_dest->prefixlen = prefixlen;
		memset(&target_dest->rcu, 0, sizeof(struct rcu_head));

		list_add_rcu(&target_dest->list, &target_route->dests);
	}
	/* else: we already stored this info for another destination =>
	 * we are finished
	 */

	spin_unlock_bh(&state[state_idx].lock);
}

static void __multipath_free(struct rcu_head *head)
{
	struct multipath_route *rt = container_of(head, struct multipath_route,
						  rcu);
	kfree(rt);
}

static void __multipath_free_dst(struct rcu_head *head)
{
	struct multipath_dest *dst = container_of(head,
						  struct multipath_dest,
						  rcu);
	kfree(dst);
}

static void wrandom_flush(void)
{
	int i;

	/* defere delete to all entries */
	for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
		struct multipath_route *r;

		spin_lock_bh(&state[i].lock);
		list_for_each_entry_rcu(r, &state[i].head, list) {
			struct multipath_dest *d;
			list_for_each_entry_rcu(d, &r->dests, list) {
				list_del_rcu(&d->list);
				call_rcu(&d->rcu,
					 __multipath_free_dst);
			}
			list_del_rcu(&r->list);
			call_rcu(&r->rcu,
				 __multipath_free);
		}

		spin_unlock_bh(&state[i].lock);
	}
}

static struct ip_mp_alg_ops wrandom_ops = {
	.mp_alg_select_route	=	wrandom_select_route,
	.mp_alg_flush		=	wrandom_flush,
	.mp_alg_set_nhinfo	=	wrandom_set_nhinfo,
};

static int __init wrandom_init(void)
{
	wrandom_init_state();

	return multipath_alg_register(&wrandom_ops, IP_MP_ALG_WRANDOM);
}

static void __exit wrandom_exit(void)
{
	multipath_alg_unregister(&wrandom_ops, IP_MP_ALG_WRANDOM);
}

module_init(wrandom_init);
module_exit(wrandom_exit);
MODULE_LICENSE("GPL");