aboutsummaryrefslogtreecommitdiffstats
path: root/tools/perf/scripts/python/compaction-times.py
blob: 239cb0568ec3e6b941c8a19dd29c4f588c096a42 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
# report time spent in compaction
# Licensed under the terms of the GNU GPL License version 2

# testing:
# 'echo 1 > /proc/sys/vm/compact_memory' to force compaction of all zones

import os
import sys
import re

import signal
signal.signal(signal.SIGPIPE, signal.SIG_DFL)

# One-line synopsis; printed by pr_help() and passed to sys.exit() on bad arguments.
usage = (
	"usage: perf script report compaction-times.py -- "
	"[-h] [-u] [-p|-pv] [-t | [-m] [-fs] [-ms]] "
	"[pid|pid-range|comm-regex]\n"
)

class popt:
	"""Values for opt_proc: how much per-process detail the report shows."""
	# DISP_DFL: totals only; DISP_PROC: one line per process (-p);
	# DISP_PROC_VERBOSE: additionally one line per compaction run (-pv).
	DISP_DFL, DISP_PROC, DISP_PROC_VERBOSE = range(3)

class topt:
	"""Bit flags for opt_disp: which statistics accompany the elapsed time."""
	# No bit set: only the time is printed.
	DISP_TIME = 0
	# One bit per optional section of the output.
	DISP_MIG = 1 << 0
	DISP_ISOLFREE = 1 << 1
	DISP_ISOLMIG = 1 << 2
	# Every section enabled.
	DISP_ALL = DISP_MIG | DISP_ISOLFREE | DISP_ISOLMIG

class comm_filter:
	"""Filter that keeps only processes whose command name matches a regex."""

	def __init__(self, re):
		# 're' is a compiled pattern object (it shadows the module name here).
		self.re = re

	def filter(self, pid, comm):
		"""Return True when the process must be excluded from the report.

		A process is excluded when the pattern does not match comm at all,
		or when it only matches the empty string. pid is unused.
		"""
		match = self.re.search(comm)
		if match is None:
			return True
		return match.group() == ""

class pid_filter:
	"""Filter that keeps only processes whose pid lies in [low, high]."""

	def __init__(self, low, high):
		# Bounds arrive as strings; an empty string becomes 0.
		# For the upper bound, 0 means "no upper limit".
		self.low = int(low) if low != "" else 0
		self.high = int(high) if high != "" else 0

	def filter(self, pid, comm):
		"""Return True when pid falls outside the range (comm is unused)."""
		if pid < self.low:
			return True
		return self.high != 0 and pid > self.high

def set_type(t):
	"""Fold display flag t into the global opt_disp.

	While opt_disp still holds topt.DISP_ALL it is replaced by t;
	otherwise t is OR-ed into the current value.
	"""
	global opt_disp
	if opt_disp == topt.DISP_ALL:
		opt_disp = t
	else:
		opt_disp = opt_disp | t

def ns(sec, nsec):
	"""Combine a (seconds, nanoseconds) pair into a single nanosecond count."""
	nsec_per_sec = 1000000000
	return sec * nsec_per_sec + nsec

def time(ns):
	"""Format a nanosecond duration for display.

	With the global opt_ns set the value is printed as nanoseconds;
	otherwise it is rounded to the nearest 1000 ns and printed as
	microseconds.
	"""
	if opt_ns:
		return "%dns" % ns
	usecs = round(ns, -3) / 1000
	return "%dus" % usecs

class pair:
	"""Two labelled counters that are always accumulated together.

	aval/bval are the counter values; alabel/blabel are the names used
	when the pair is printed.
	"""

	def __init__(self, aval, bval, alabel = None, blabel = None):
		self.alabel = alabel
		self.blabel = blabel
		self.aval = aval
		self.bval = bval

	def __iadd__(self, rhs):
		"""In-place accumulation (`self += rhs`); returns self."""
		self.aval += rhs.aval
		self.bval += rhs.bval
		return self

	def __add__(self, rhs):
		"""Return a new pair holding the sums; neither operand is modified.

		The result carries the labels of the left operand.
		"""
		return pair(self.aval + rhs.aval, self.bval + rhs.bval,
			self.alabel, self.blabel)

	def __str__(self):
		return "%s=%d %s=%d" % (self.alabel, self.aval, self.blabel, self.bval)

class cnode:
	"""Statistics for one compaction run, or the running sum of several.

	ns        start timestamp while the run is pending, elapsed time once
	          complete() has been called (both in nanoseconds)
	migrated  pair of moved/failed counts
	fscan     pair of scanned/isolated counts for the free scanner
	mscan     pair of scanned/isolated counts for the migration scanner
	"""

	def __init__(self, ns):
		self.ns = ns
		self.migrated = pair(0, 0, "moved", "failed")
		self.fscan = pair(0, 0, "scanned", "isolated")
		self.mscan = pair(0, 0, "scanned", "isolated")

	def __iadd__(self, rhs):
		"""In-place accumulation (`self += rhs`); returns self."""
		self.ns += rhs.ns
		self.migrated += rhs.migrated
		self.fscan += rhs.fscan
		self.mscan += rhs.mscan
		return self

	def __add__(self, rhs):
		"""Return a new node holding the sums; neither operand is modified."""
		# Local import: only this rarely used operator needs the module.
		import copy

		# Deep copy so the result does not share counter objects with self.
		total = copy.deepcopy(self)
		total += rhs
		return total

	def __str__(self):
		# The time is always shown; the other sections depend on opt_disp.
		sections = []
		if opt_disp & topt.DISP_MIG:
			sections.append("migration: %s" % self.migrated)
		if opt_disp & topt.DISP_ISOLFREE:
			sections.append("free_scanner: %s" % self.fscan)
		if opt_disp & topt.DISP_ISOLMIG:
			sections.append("migration_scanner: %s" % self.mscan)
		return "%s %s" % (time(self.ns), " ".join(sections))

	def complete(self, secs, nsecs):
		"""Turn the stored start timestamp into the elapsed time."""
		self.ns = ns(secs, nsecs) - self.ns

	def increment(self, migrated, fscan, mscan):
		"""Add whichever of the three counter pairs is not None."""
		if migrated is not None:
			self.migrated += migrated
		if fscan is not None:
			self.fscan += fscan
		if mscan is not None:
			self.mscan += mscan


class chead:
	"""Per-process compaction statistics, plus class-level global state.

	Class attributes hold the state shared by the whole report; each
	instance tracks one process (pid/comm), its accumulated statistics,
	the compaction run currently in progress (if any) and, in verbose
	mode, the list of completed runs.
	"""

	heads = {}	# pid -> chead instance
	val = cnode(0)	# grand total over all completed, unfiltered runs
	fobj = None	# optional filter object providing filter(pid, comm)

	@classmethod
	def add_filter(cls, filter):
		"""Install the filter consulted when a new process is first seen."""
		cls.fobj = filter

	@classmethod
	def create_pending(cls, pid, comm, start_secs, start_nsecs):
		"""Start tracking a compaction run for pid at the given timestamp.

		The filter decision is taken once, when the pid is first seen,
		and remembered on the instance.
		"""
		head = cls.heads.get(pid)
		if head is None:
			filtered = False
			if cls.fobj is not None:
				filtered = cls.fobj.filter(pid, comm)
			head = cls.heads[pid] = chead(comm, pid, filtered)

		if not head.is_filtered():
			head.mark_pending(start_secs, start_nsecs)

	@classmethod
	def _pending_head(cls, pid):
		"""Return the head of pid if it has a run in progress, else None.

		A pid that was never seen, or that has no run in progress, is
		reported on stderr. Filtered processes are skipped silently.
		"""
		head = cls.heads.get(pid)
		if head is not None and head.is_filtered():
			return None
		if head is None or not head.is_pending():
			sys.stderr.write("missing start compaction event for pid %d\n" % pid)
			return None
		return head

	@classmethod
	def increment_pending(cls, pid, migrated, fscan, mscan):
		"""Add counters to the run in progress for pid."""
		head = cls._pending_head(pid)
		if head is not None:
			head.do_increment(migrated, fscan, mscan)

	@classmethod
	def complete_pending(cls, pid, secs, nsecs):
		"""Finish the run in progress for pid at the given timestamp."""
		head = cls._pending_head(pid)
		if head is not None:
			head.make_complete(secs, nsecs)

	@classmethod
	def gen(cls):
		"""Yield every per-process head, unless only totals were requested."""
		if opt_proc != popt.DISP_DFL:
			for pid in cls.heads:
				yield cls.heads[pid]

	@classmethod
	def str(cls):
		"""Return the grand-total node (formatted by the caller via %s)."""
		return cls.val

	def __init__(self, comm, pid, filtered):
		self.comm = comm
		self.pid = pid
		self.val = cnode(0)	# per-process total
		self.pending = None	# cnode of the run in progress, if any
		self.filtered = filtered
		self.list = []		# completed runs, kept in verbose mode only

	def __add__(self, rhs):
		"""Accumulate the statistics of rhs into self and return self."""
		# All accumulated state lives in self.val; a chead has no 'ns'
		# attribute of its own.
		self.val += rhs.val
		return self

	def mark_pending(self, secs, nsecs):
		"""Open a new run whose start time is the given timestamp."""
		self.pending = cnode(ns(secs, nsecs))

	def do_increment(self, migrated, fscan, mscan):
		"""Add counters to the run in progress."""
		self.pending.increment(migrated, fscan, mscan)

	def make_complete(self, secs, nsecs):
		"""Close the run in progress and fold it into the totals."""
		self.pending.complete(secs, nsecs)
		chead.val += self.pending

		# Per-process bookkeeping is only needed for -p / -pv.
		if opt_proc != popt.DISP_DFL:
			self.val += self.pending

			if opt_proc == popt.DISP_PROC_VERBOSE:
				self.list.append(self.pending)
		self.pending = None

	def enumerate(self):
		"""In verbose mode, print one numbered line per completed run."""
		if opt_proc == popt.DISP_PROC_VERBOSE and not self.is_filtered():
			for i, pelem in enumerate(self.list):
				sys.stdout.write("%d[%s].%d: %s\n" % (self.pid, self.comm, i+1, pelem))

	def is_pending(self):
		return self.pending is not None

	def is_filtered(self):
		return self.filtered

	def display(self):
		"""Print the per-process total unless the process is filtered."""
		if not self.is_filtered():
			sys.stdout.write("%d[%s]: %s\n" % (self.pid, self.comm, self.val))


def trace_end():
	"""Print the final report: the total, then each process from chead.gen()."""
	sys.stdout.write("total: %s\n" % chead.str())
	for head in chead.gen():
		head.display()
		head.enumerate()

def compaction__mm_compaction_migratepages(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	common_callchain, nr_migrated, nr_failed):
	"""Add the migrated/failed counts to the pending run of common_pid."""
	migrated = pair(nr_migrated, nr_failed)
	chead.increment_pending(common_pid, migrated, None, None)

def compaction__mm_compaction_isolate_freepages(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	common_callchain, start_pfn, end_pfn, nr_scanned, nr_taken):
	"""Add the scanned/taken counts to the free-scanner stats of common_pid."""
	free_scan = pair(nr_scanned, nr_taken)
	chead.increment_pending(common_pid, None, free_scan, None)

def compaction__mm_compaction_isolate_migratepages(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	common_callchain, start_pfn, end_pfn, nr_scanned, nr_taken):
	"""Add the scanned/taken counts to the migration-scanner stats of common_pid."""
	migrate_scan = pair(nr_scanned, nr_taken)
	chead.increment_pending(common_pid, None, None, migrate_scan)

def compaction__mm_compaction_end(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	common_callchain, zone_start, migrate_start, free_start, zone_end,
	sync, status):
	"""Close the pending run of common_pid at this event's timestamp."""
	end_secs, end_nsecs = common_secs, common_nsecs
	chead.complete_pending(common_pid, end_secs, end_nsecs)

def compaction__mm_compaction_begin(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	common_callchain, zone_start, migrate_start, free_start, zone_end,
	sync):
	"""Open a pending run for common_pid starting at this event's timestamp."""
	start_secs, start_nsecs = common_secs, common_nsecs
	chead.create_pending(common_pid, common_comm, start_secs, start_nsecs)

def pr_help():
	"""Write the usage line followed by a description of each option to stdout."""
	help_text = (
		usage,
		"\n",
		"-h	display this help\n",
		"-p	display by process\n",
		"-pv	display by process (verbose)\n",
		"-t	display stall times only\n",
		"-m	display stats for migration\n",
		"-fs	display stats for free scanner\n",
		"-ms	display stats for migration scanner\n",
		"-u	display results in microseconds (default nanoseconds)\n",
	)
	for chunk in help_text:
		sys.stdout.write(chunk)


# Command-line handling. Runs at import time, before any trace event is
# delivered to the handlers above.

comm_re = None
pid_re = None
# Accepts "low-high" (either bound may be empty) or a single pid.
# Raw string so that \d reaches the regex engine as written.
pid_regex = r"^(\d*)-(\d*)$|^(\d*)$"

# Defaults: totals only, all statistics, times in nanoseconds.
opt_proc = popt.DISP_DFL
opt_disp = topt.DISP_ALL

opt_ns = True

argc = len(sys.argv) - 1
if argc >= 1:
	pid_re = re.compile(pid_regex)

	for i, opt in enumerate(sys.argv[1:]):
		if not opt:
			# An empty argument is neither an option nor a usable filter.
			sys.exit(usage)

		if opt.startswith("-"):
			if opt == "-h":
				pr_help()
				sys.exit(0)
			elif opt == "-p":
				opt_proc = popt.DISP_PROC
			elif opt == "-pv":
				opt_proc = popt.DISP_PROC_VERBOSE
			elif opt == '-u':
				opt_ns = False
			elif opt == "-t":
				set_type(topt.DISP_TIME)
			elif opt == "-m":
				set_type(topt.DISP_MIG)
			elif opt == "-fs":
				set_type(topt.DISP_ISOLFREE)
			elif opt == "-ms":
				set_type(topt.DISP_ISOLMIG)
			else:
				sys.exit(usage)

		elif i == argc - 1:
			# Only the last argument may be a filter: a pid, a pid range,
			# or otherwise a regular expression matched against comm.
			m = pid_re.search(opt)
			if m is not None and m.group() != "":
				if m.group(3) is not None:
					# Single pid: use it as both bounds.
					f = pid_filter(m.group(3), m.group(3))
				else:
					f = pid_filter(m.group(1), m.group(2))
			else:
				try:
					comm_re = re.compile(opt)
				except re.error:
					sys.stderr.write("invalid regex '%s'\n" % opt)
					sys.exit(usage)
				f = comm_filter(comm_re)

			chead.add_filter(f)
pan class="hl opt">== t->parms.iph.saddr && (t->dev->flags&IFF_UP)) return t; t = rcu_dereference(ipn->tunnels_wc[0]); if (t && (t->dev->flags&IFF_UP)) return t; return NULL; } static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn, struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; unsigned int h = 0; int prio = 0; if (remote) { prio |= 2; h ^= HASH(remote); } if (local) { prio |= 1; h ^= HASH(local); } return &ipn->tunnels[prio][h]; } static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn, struct ip_tunnel *t) { return __ipip_bucket(ipn, &t->parms); } static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) { struct ip_tunnel __rcu **tp; struct ip_tunnel *iter; for (tp = ipip_bucket(ipn, t); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) { struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t); rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } static struct ip_tunnel * ipip_tunnel_locate(struct net *net, struct ip_tunnel_parm *parms, int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; struct ip_tunnel *t, *nt; struct ip_tunnel __rcu **tp; struct net_device *dev; char name[IFNAMSIZ]; struct ipip_net *ipn = net_generic(net, ipip_net_id); for (tp = __ipip_bucket(ipn, parms); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) return t; } if (!create) return NULL; if (parms->name[0]) strlcpy(name, parms->name, IFNAMSIZ); else strcpy(name, "tunl%d"); dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); if (dev == NULL) return NULL; dev_net_set(dev, net); if (strchr(name, '%')) { if (dev_alloc_name(dev, name) < 0) goto failed_free; } nt = netdev_priv(dev); nt->parms = *parms; if (ipip_tunnel_init(dev) < 0) 
goto failed_free; if (register_netdevice(dev) < 0) goto failed_free; dev_hold(dev); ipip_tunnel_link(ipn, nt); return nt; failed_free: ipip_dev_free(dev); return NULL; } /* called with RTNL */ static void ipip_tunnel_uninit(struct net_device *dev) { struct net *net = dev_net(dev); struct ipip_net *ipn = net_generic(net, ipip_net_id); if (dev == ipn->fb_tunnel_dev) rcu_assign_pointer(ipn->tunnels_wc[0], NULL); else ipip_tunnel_unlink(ipn, netdev_priv(dev)); dev_put(dev); } static int ipip_err(struct sk_buff *skb, u32 info) { /* All the routers (except for Linux) return only 8 bytes of packet payload. It means, that precise relaying of ICMP in the real Internet is absolutely infeasible. */ struct iphdr *iph = (struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; int err; switch (type) { default: case ICMP_PARAMETERPROB: return 0; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ return 0; case ICMP_FRAG_NEEDED: /* Soft state for pmtu is maintained by IP core. */ return 0; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, I believe they are just ether pollution. 
--ANK */ break; } break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; break; } err = -ENOENT; rcu_read_lock(); t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); if (t == NULL || t->parms.iph.daddr == 0) goto out; err = 0; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; out: rcu_read_unlock(); return err; } static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph, struct sk_buff *skb) { struct iphdr *inner_iph = ip_hdr(skb); if (INET_ECN_is_ce(outer_iph->tos)) IP_ECN_set_ce(inner_iph); } static int ipip_rcv(struct sk_buff *skb) { struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); rcu_read_lock(); tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); if (tunnel != NULL) { struct pcpu_tstats *tstats; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { rcu_read_unlock(); kfree_skb(skb); return 0; } secpath_reset(skb); skb->mac_header = skb->network_header; skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); skb->pkt_type = PACKET_HOST; tstats = this_cpu_ptr(tunnel->dev->tstats); tstats->rx_packets++; tstats->rx_bytes += skb->len; __skb_tunnel_rx(skb, tunnel->dev); ipip_ecn_decapsulate(iph, skb); netif_rx(skb); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -1; } /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. 
*/ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct pcpu_tstats *tstats; struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ struct iphdr *old_iph = ip_hdr(skb); struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; int mtu; if (skb->protocol != htons(ETH_P_IP)) goto tx_error; if (tos & 1) tos = old_iph->tos; if (!dst) { /* NBMA tunnel */ if ((rt = skb_rtable(skb)) == NULL) { dev->stats.tx_fifo_errors++; goto tx_error; } if ((dst = rt->rt_gateway) == 0) goto tx_error_icmp; } { struct flowi fl = { .oif = tunnel->parms.link, .fl4_dst = dst, .fl4_src= tiph->saddr, .fl4_tos = RT_TOS(tos), .proto = IPPROTO_IPIP }; if (ip_route_output_key(dev_net(dev), &rt, &fl)) { dev->stats.tx_carrier_errors++; goto tx_error_icmp; } } tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); dev->stats.collisions++; goto tx_error; } df |= old_iph->frag_off & htons(IP_DF); if (df) { mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { dev->stats.collisions++; ip_rt_put(rt); goto tx_error; } if (skb_dst(skb)) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); if ((old_iph->frag_off & htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); goto tx_error; } } if (tunnel->err_count > 0) { if (time_before(jiffies, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); } else tunnel->err_count = 0; } /* * Okay, now see if we can stuff it in the buffer as-is. 
*/ max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr)); if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); dev->stats.tx_dropped++; dev_kfree_skb(skb); return NETDEV_TX_OK; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; old_iph = ip_hdr(skb); } skb->transport_header = skb->network_header; skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); /* * Push down and install the IPIP header. */ iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; iph->frag_off = df; iph->protocol = IPPROTO_IPIP; iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; if ((iph->ttl = tiph->ttl) == 0) iph->ttl = old_iph->ttl; nf_reset(skb); tstats = this_cpu_ptr(dev->tstats); __IPTUNNEL_XMIT(tstats, &dev->stats); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: dev->stats.tx_errors++; dev_kfree_skb(skb); return NETDEV_TX_OK; } static void ipip_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; struct ip_tunnel *tunnel; struct iphdr *iph; tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; if (iph->daddr) { struct flowi fl = { .oif = tunnel->parms.link, .fl4_dst = iph->daddr, .fl4_src = iph->saddr, .fl4_tos = RT_TOS(iph->tos), .proto = IPPROTO_IPIP }; struct rtable *rt; if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { tdev = rt->dst.dev; ip_rt_put(rt); } dev->flags |= IFF_POINTOPOINT; } if (!tdev && tunnel->parms.link) tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); if (tdev) { dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); dev->mtu = tdev->mtu 
- sizeof(struct iphdr); } dev->iflink = tunnel->parms.link; } static int ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) { int err = 0; struct ip_tunnel_parm p; struct ip_tunnel *t; struct net *net = dev_net(dev); struct ipip_net *ipn = net_generic(net, ipip_net_id); switch (cmd) { case SIOCGETTUNNEL: t = NULL; if (dev == ipn->fb_tunnel_dev) { if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { err = -EFAULT; break; } t = ipip_tunnel_locate(net, &p, 0); } if (t == NULL) t = netdev_priv(dev); memcpy(&p, &t->parms, sizeof(p)); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!capable(CAP_NET_ADMIN)) goto done; err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) goto done; err = -EINVAL; if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) goto done; if (p.iph.ttl) p.iph.frag_off |= htons(IP_DF); t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { if (t != NULL) { if (t->dev != dev) { err = -EEXIST; break; } } else { if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { err = -EINVAL; break; } t = netdev_priv(dev); ipip_tunnel_unlink(ipn, t); synchronize_net(); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; memcpy(dev->dev_addr, &p.iph.saddr, 4); memcpy(dev->broadcast, &p.iph.daddr, 4); ipip_tunnel_link(ipn, t); netdev_state_change(dev); } } if (t) { err = 0; if (cmd == SIOCCHGTUNNEL) { t->parms.iph.ttl = p.iph.ttl; t->parms.iph.tos = p.iph.tos; t->parms.iph.frag_off = p.iph.frag_off; if (t->parms.link != p.link) { t->parms.link = p.link; ipip_tunnel_bind_dev(dev); netdev_state_change(dev); } } if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) err = -EFAULT; } else err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); break; case SIOCDELTUNNEL: err = -EPERM; if (!capable(CAP_NET_ADMIN)) goto done; if (dev == ipn->fb_tunnel_dev) { err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) goto done; err = -ENOENT; if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL) goto done; err = -EPERM; if (t->dev == ipn->fb_tunnel_dev) goto done; dev = t->dev; } unregister_netdevice(dev); err = 0; break; default: err = -EINVAL; } done: return err; } static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) { if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) return -EINVAL; dev->mtu = new_mtu; return 0; } static const struct net_device_ops ipip_netdev_ops = { .ndo_uninit = ipip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, .ndo_do_ioctl = ipip_tunnel_ioctl, .ndo_change_mtu = ipip_tunnel_change_mtu, .ndo_get_stats = ipip_get_stats, }; static void ipip_dev_free(struct net_device *dev) { free_percpu(dev->tstats); free_netdev(dev); } static void ipip_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip_netdev_ops; dev->destructor = ipip_dev_free; dev->type = ARPHRD_TUNNEL; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); dev->flags = IFF_NOARP; dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_NETNS_LOCAL; dev->features |= NETIF_F_LLTX; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } static int ipip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); tunnel->dev = dev; strcpy(tunnel->parms.name, dev->name); memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); ipip_tunnel_bind_dev(dev); dev->tstats = alloc_percpu(struct pcpu_tstats); if (!dev->tstats) return -ENOMEM; return 0; } static int __net_init ipip_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; struct ipip_net *ipn = net_generic(dev_net(dev), 
ipip_net_id); tunnel->dev = dev; strcpy(tunnel->parms.name, dev->name); iph->version = 4; iph->protocol = IPPROTO_IPIP; iph->ihl = 5; dev->tstats = alloc_percpu(struct pcpu_tstats); if (!dev->tstats) return -ENOMEM; dev_hold(dev); rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); return 0; } static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip_err, .priority = 1, }; static const char banner[] __initconst = KERN_INFO "IPv4 over IPv4 tunneling driver\n"; static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) { int prio; for (prio = 1; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { struct ip_tunnel *t; t = rtnl_dereference(ipn->tunnels[prio][h]); while (t != NULL) { unregister_netdevice_queue(t->dev, head); t = rtnl_dereference(t->next); } } } } static int __net_init ipip_init_net(struct net *net) { struct ipip_net *ipn = net_generic(net, ipip_net_id); int err; ipn->tunnels[0] = ipn->tunnels_wc; ipn->tunnels[1] = ipn->tunnels_l; ipn->tunnels[2] = ipn->tunnels_r; ipn->tunnels[3] = ipn->tunnels_r_l; ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "tunl0", ipip_tunnel_setup); if (!ipn->fb_tunnel_dev) { err = -ENOMEM; goto err_alloc_dev; } dev_net_set(ipn->fb_tunnel_dev, net); err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev); if (err) goto err_reg_dev; if ((err = register_netdev(ipn->fb_tunnel_dev))) goto err_reg_dev; return 0; err_reg_dev: ipip_dev_free(ipn->fb_tunnel_dev); err_alloc_dev: /* nothing */ return err; } static void __net_exit ipip_exit_net(struct net *net) { struct ipip_net *ipn = net_generic(net, ipip_net_id); LIST_HEAD(list); rtnl_lock(); ipip_destroy_tunnels(ipn, &list); unregister_netdevice_queue(ipn->fb_tunnel_dev, &list); unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, .exit = ipip_exit_net, .id = &ipip_net_id, .size = sizeof(struct ipip_net), }; static int __init ipip_init(void) { int err; 
printk(banner); err = register_pernet_device(&ipip_net_ops); if (err < 0) return err; err = xfrm4_tunnel_register(&ipip_handler, AF_INET); if (err < 0) { unregister_pernet_device(&ipip_net_ops); printk(KERN_INFO "ipip init: can't register tunnel\n"); } return err; } static void __exit ipip_fini(void) { if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) printk(KERN_INFO "ipip close: can't deregister tunnel\n"); unregister_pernet_device(&ipip_net_ops); } module_init(ipip_init); module_exit(ipip_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS("tunl0");