path: root/net/ipv4/ip_gre.c
blob: 5c14ed63e56c97c8b53f188fd0bfd6997a3e8f8c

/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>

#ifdef CONFIG_IPV6
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is the best
   solution, but it requires maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: the t->recursion lock breaks dead loops. It looks
   like the dev->tbusy flag, but I preferred a new variable because
   the semantics are different. One day, when hard_start_xmit
   becomes multithreaded, we will have to use skb->encapsulation.



   2. Networking dead loops would not kill routers, but they would really
   kill the network. The IP hop limit plays the role of "t->recursion" in this
   case, if we copy it from the packet being encapsulated to the upper header.
   It is a very good solution, but it introduces two problems:

   - Routing protocols using packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all routers (at least, in my neighbourhood)
     return only 8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work or traceroute to say something reasonable,
   we should search for another solution.

   One of them is to parse the packet, trying to detect inner encapsulation
   made by our node. That is difficult or even impossible, especially
   taking fragmentation into account. In short, it is no solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but the exponential growth of network traffic changes to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   quickly degrades to a value <68, where looping stops.
   Yes, it is not good if there is a router in the loop
   which does not force DF, even when the encapsulating packets have DF set.
   But it is not our problem! Nobody could accuse us; we did
   all that we could do. Even if it is your gated that injected
   the fatal route into the network, even if it was you who configured
   the fatal static route: you are innocent. :-)



   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not very evident how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally modular.
   We could extract the common parts (hash table, ioctl, etc.)
   into a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */

static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

static int ipgre_fb_tunnel_init(struct net_device *dev);

static struct net_device *ipgre_fb_tunnel_dev;

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
 */

#define HASH_SIZE  16
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

static struct ip_tunnel *tunnels[4][HASH_SIZE];

#define tunnels_r_l	(tunnels[3])
#define tunnels_r	(tunnels[2])
#define tunnels_l	(tunnels[1])
#define tunnels_wc	(tunnels[0])

static DEFINE_RWLOCK(ipgre_lock);

/* Given src, dst and key, find appropriate for input tunnel. */

static struct ip_tunnel * ipgre_tunnel_lookup(__be32 remote, __be32 local, __be32 key)
{
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);
	struct ip_tunnel *t;

	for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_r[h0^h1]; t; t = t->next) {
		if (remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_l[h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr ||
		     (local == t->parms.iph.daddr && MULTICAST(local))) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_wc[h1]; t; t = t->next) {
		if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
			return t;
	}

	if (ipgre_fb_tunnel_dev->flags&IFF_UP)
		return netdev_priv(ipgre_fb_tunnel_dev);
	return NULL;
}

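/* Pick the hash bucket for a tunnel's parameters: bit 0 of the priority
 * is set when a local address is configured, bit 1 when a unicast remote
 * address is configured, and the slot is derived from the key hash
 * (xor'ed with the remote address hash when the remote address is used).
 */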
static struct ip_tunnel **__ipgre_bucket(struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !MULTICAST(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &tunnels[prio][h];
}

static inline struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
{
	return __ipgre_bucket(&t->parms);
}

static void ipgre_tunnel_link(struct ip_tunnel *t)
{
	struct ip_tunnel **tp = ipgre_bucket(t);

	t->next = *tp;
	write_lock_bh(&ipgre_lock);
	*tp = t;
	write_unlock_bh(&ipgre_lock);
}

static void ipgre_tunnel_unlink(struct ip_tunnel *t)
{
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
		if (t == *tp) {
			write_lock_bh(&ipgre_lock);
			*tp = t->next;
			write_unlock_bh(&ipgre_lock);
			break;
		}
	}
}

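/* Find a tunnel with exactly matching local/remote addresses and key.
 * If none exists and @create is set, allocate and register a new tunnel
 * device (named gre%d unless a name was given) and link it into the
 * hash table.
 */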
static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	struct ip_tunnel *t, **tp, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];

	for (tp = __ipgre_bucket(parms); (t = *tp) != NULL; tp = &t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (key == t->parms.i_key)
				return t;
		}
	}
	if (!create)
		return NULL;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		int i;
		for (i=1; i<100; i++) {
			sprintf(name, "gre%d", i);
			if (__dev_get_by_name(name) == NULL)
				break;
		}
		if (i==100)
			goto failed;
	}

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev->init = ipgre_tunnel_init;
	nt = netdev_priv(dev);
	nt->parms = *parms;

	if (register_netdevice(dev) < 0) {
		free_netdev(dev);
		goto failed;
	}

	dev_hold(dev);
	ipgre_tunnel_link(nt);
	return nt;

failed:
	return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	ipgre_tunnel_unlink(netdev_priv(dev));
	dev_put(dev);
}


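/* Handle ICMP errors received in response to outgoing GRE packets. */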
static void ipgre_err(struct sk_buff *skb, u32 info)
{
#ifndef I_WISH_WORLD_WERE_PERFECT

/* It is not :-( All routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put the GRE key in the third word
   of the GRE header. That makes it impossible to maintain even soft state
   for keyed GRE tunnels with the checksum enabled. Tell them "thank you".

   Well, I wonder: rfc1812 was written by a Cisco employee,
   so why the hell do these idiots break standards established
   by themselves???
 */

	struct iphdr *iph = (struct iphdr*)skb->data;
	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes are returned, the keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	read_lock(&ipgre_lock);
	t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((__be32*)p) + (grehlen>>2) - 1) : 0);
	if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
		goto out;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	read_unlock(&ipgre_lock);
	return;
#else
	struct iphdr *iph = (struct iphdr*)dp;
	struct iphdr *eiph;
	__be16	     *p = (__be16*)(dp+(iph->ihl<<2));
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	int rel_type = 0;
	int rel_code = 0;
	__be32 rel_info = 0;
	__u32 n = 0;
	__be16 flags;
	int grehlen = (iph->ihl<<2) + 4;
	struct sk_buff *skb2;
	struct flowi fl;
	struct rtable *rt;

	if (p[1] != htons(ETH_P_IP))
		return;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_CSUM)
			grehlen += 4;
		if (flags&GRE_KEY)
			grehlen += 4;
		if (flags&GRE_SEQ)
			grehlen += 4;
	}
	if (len < grehlen + sizeof(struct iphdr))
		return;
	eiph = (struct iphdr*)(dp + grehlen);

	switch (type) {
	default:
		return;
	case ICMP_PARAMETERPROB:
		n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
		if (n < (iph->ihl<<2))
			return;

		/* So... This guy found something strange INSIDE the encapsulated
		   packet. Well, he is a fool, but what can we do?
		 */
		rel_type = ICMP_PARAMETERPROB;
		n -= grehlen;
		rel_info = htonl(n << 24);
		break;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* And it is the only really necessary thing :-) */
			n = ntohs(icmp_hdr(skb)->un.frag.mtu);
			if (n < grehlen+68)
				return;
			n -= grehlen;
			/* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
			if (n > ntohs(eiph->tot_len))
				return;
			rel_info = htonl(n);
			break;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe, it is just ether pollution. --ANK
			 */
			rel_type = ICMP_DEST_UNREACH;
			rel_code = ICMP_HOST_UNREACH;
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	/* Prepare fake skb to feed it to icmp_send */
	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (skb2 == NULL)
		return;
	dst_release(skb2->dst);
	skb2->dst = NULL;
	skb_pull(skb2, skb->data - (u8*)eiph);
	skb_reset_network_header(skb2);

	/* Try to guess incoming interface */
	memset(&fl, 0, sizeof(fl));
	fl.fl4_dst = eiph->saddr;
	fl.fl4_tos = RT_TOS(eiph->tos);
	fl.proto = IPPROTO_GRE;
	if (ip_route_output_key(&rt, &fl)) {
		kfree_skb(skb2);
		return;
	}
	skb2->dev = rt->u.dst.dev;

	/* route "incoming" packet */
	if (rt->rt_flags&RTCF_LOCAL) {
		ip_rt_put(rt);
		rt = NULL;
		fl.fl4_dst = eiph->daddr;
		fl.fl4_src = eiph->saddr;
		fl.fl4_tos = eiph->tos;
		if (ip_route_output_key(&rt, &fl) ||
		    rt->u.dst.dev->type != ARPHRD_IPGRE) {
			ip_rt_put(rt);
			kfree_skb(skb2);
			return;
		}
	} else {
		ip_rt_put(rt);
		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
		    skb2->dst->dev->type != ARPHRD_IPGRE) {
			kfree_skb(skb2);
			return;
		}
	}

	/* change mtu on this route */
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		if (n > dst_mtu(skb2->dst)) {
			kfree_skb(skb2);
			return;
		}
		skb2->dst->ops->update_pmtu(skb2->dst, n);
	} else if (type == ICMP_TIME_EXCEEDED) {
		struct ip_tunnel *t = netdev_priv(skb2->dev);
		if (t->parms.iph.ttl) {
			rel_type = ICMP_DEST_UNREACH;
			rel_code = ICMP_HOST_UNREACH;
		}
	}

	icmp_send(skb2, rel_type, rel_code, rel_info);
	kfree_skb(skb2);
#endif
}

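/* Propagate an ECN CE mark from the outer IP header to the inner packet. */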
static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));
		}
	}
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}

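/* Receive path: parse the GRE flags, validate the optional checksum,
 * key and sequence number, find the matching tunnel, strip the outer
 * headers and hand the decapsulated packet back to the stack.
 */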
static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;

	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
		secpath_reset(skb);

		skb->protocol = *(__be16*)(h + 2);
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change the protocol to IP
		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
		 */
		if (flags == 0 &&
		    skb->protocol == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb_reset_mac_header(skb);
		__pskb_pull(skb, offset);
		skb_reset_network_header(skb);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (MULTICAST(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (((struct rtable*)skb->dst)->fl.iif == 0)
				goto drop;
			tunnel->stat.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->stat.rx_crc_errors++;
			tunnel->stat.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->stat.rx_fifo_errors++;
				tunnel->stat.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}
		tunnel->stat.rx_packets++;
		tunnel->stat.rx_bytes += skb->len;
		skb->dev = tunnel->dev;
		dst_release(skb->dst);
		skb->dst = NULL;
		nf_reset(skb);
		ipgre_ecn_decapsulate(iph, skb);
		netif_rx(skb);
		read_unlock(&ipgre_lock);
		return(0);
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	read_unlock(&ipgre_lock);
drop_nolock:
	kfree_skb(skb);
	return(0);
}

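/* Transmit path: resolve the outer route, enforce path MTU, make sure
 * there is enough headroom, then push the outer IP and GRE headers and
 * hand the packet to the underlying device.
 */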
static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &tunnel->stat;
	struct iphdr  *old_iph = ip_hdr(skb);
	struct iphdr  *tiph;
	u8     tos;
	__be16 df;
	struct rtable *rt;     			/* Route to the other host */
	struct net_device *tdev;			/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	int    max_headroom;			/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (tunnel->recursion++) {
		tunnel->stat.collisions++;
		goto tx_error;
	}

	if (dev->hard_header) {
		gre_hlen = 0;
		tiph = (struct iphdr*)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb->dst == NULL) {
			tunnel->stat.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = (struct rtable*)skb->dst;
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb->dst->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr*)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	tos = tiph->tos;
	if (tos&1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		tos &= ~1;
	}

	{
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = dst,
						.saddr = tiph->saddr,
						.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };
		if (ip_route_output_key(&rt, &fl)) {
			tunnel->stat.tx_carrier_errors++;
			goto tx_error;
		}
	}
	tdev = rt->u.dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		tunnel->stat.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
	else
		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;

	if (skb->dst)
		skb->dst->ops->update_pmtu(skb->dst, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;

		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb->dst->metrics[RTAX_MTU-1] = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			stats->tx_dropped++;
			dev_kfree_skb(skb);
			tunnel->recursion--;
			return 0;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb->transport_header = skb->network_header;
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the outer IP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	rt->rt_dst;
	iph->saddr		=	rt->rt_src;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
#endif
		else
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
	}

	((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
	((__be16*)(iph+1))[1] = skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);

	IPTUNNEL_XMIT();
	tunnel->recursion--;
	return 0;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	tunnel->recursion--;
	return 0;
}

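/* ioctl interface (SIOCGETTUNNEL/SIOCADDTUNNEL/SIOCCHGTUNNEL/SIOCDELTUNNEL)
 * used to query, create, change and delete GRE tunnels.
 */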
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ipgre_fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(&p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);

		if (dev != ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags=0;

				t = netdev_priv(dev);

				if (MULTICAST(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ipgre_fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ipgre_fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
{
	return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
}

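/* 68 is the minimum IPv4 MTU; the upper bound keeps the encapsulated
 * packet within the 16-bit IP total length field.
 */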
static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

#ifdef CONFIG_NET_IPGRE_BROADCAST
/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
			void *daddr, void *saddr, unsigned len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);

	if (daddr) {
		memcpy(&iph->daddr, daddr, 4);
		return t->hlen;
	}
	if (iph->daddr && !MULTICAST(iph->daddr))
		return t->hlen;

	return -t->hlen;
}

static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (MULTICAST(t->parms.iph.daddr)) {
		struct flowi fl = { .oif = t->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = t->parms.iph.daddr,
						.saddr = t->parms.iph.saddr,
						.tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (ip_route_output_key(&rt, &fl))
			return -EADDRNOTAVAIL;
		dev = rt->u.dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev = inetdev_by_index(t->mlink);
		if (in_dev) {
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
			in_dev_put(in_dev);
		}
	}
	return 0;
}

#endif

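/* Set up the generic net_device fields shared by all GRE tunnel devices. */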
static void ipgre_tunnel_setup(struct net_device *dev)
{
	SET_MODULE_OWNER(dev);
	dev->uninit		= ipgre_tunnel_uninit;
	dev->destructor 	= free_netdev;
	dev->hard_start_xmit	= ipgre_tunnel_xmit;
	dev->get_stats		= ipgre_tunnel_get_stats;
	dev->do_ioctl		= ipgre_tunnel_ioctl;
	dev->change_mtu		= ipgre_tunnel_change_mtu;

	dev->type		= ARPHRD_IPGRE;
	dev->hard_header_len 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
}

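/* Per-device initialisation: derive mtu and hard_header_len from the
 * route to the remote endpoint and from the configured GRE options
 * (checksum, key, sequence number).
 */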
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	/* Guess the output device to choose a reasonable mtu and hard_header_len */

	if (iph->daddr) {
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.saddr = iph->saddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (!ip_route_output_key(&rt, &fl)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}

		dev->flags |= IFF_POINTOPOINT;

#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (MULTICAST(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->hard_header = ipgre_header;
			dev->open = ipgre_open;
			dev->stop = ipgre_close;
		}
#endif
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->hard_header_len = hlen + addend;
	dev->mtu = mtu - addend;
	tunnel->hlen = addend;
	return 0;
}

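/* Set up the fallback gre0 device that receives GRE packets not matching
 * any configured tunnel.
 */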
static int __init ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
	tunnels_wc[0]		= tunnel;
	return 0;
}


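/* GRE protocol handler hooked into the IPv4 stack for IPPROTO_GRE. */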
static struct net_protocol ipgre_protocol = {
	.handler	=	ipgre_rcv,
	.err_handler	=	ipgre_err,
};


/*
 *	And now the module code and kernel interface.
 */

static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		return -EAGAIN;
	}

	ipgre_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ipgre_fb_tunnel_dev) {
		err = -ENOMEM;
		goto err1;
	}

	ipgre_fb_tunnel_dev->init = ipgre_fb_tunnel_init;

	if ((err = register_netdev(ipgre_fb_tunnel_dev)))
		goto err2;
out:
	return err;
err2:
	free_netdev(ipgre_fb_tunnel_dev);
err1:
	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
	goto out;
}

static void __exit ipgre_destroy_tunnels(void)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;
			while ((t = tunnels[prio][h]) != NULL)
				unregister_netdevice(t->dev);
		}
	}
}

static void __exit ipgre_fini(void)
{
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");

	rtnl_lock();
	ipgre_destroy_tunnels();
	rtnl_unlock();
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
="hl opt">, (unsigned long)2); XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); XFS_STATS_ADD(xs_log_blocks, BTOBB(count)); /* Do we need to split this write into 2 parts? */ if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); iclog->ic_bwritecnt = 2; /* split into 2 writes */ } else { iclog->ic_bwritecnt = 1; } XFS_BUF_SET_COUNT(bp, count); XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */ XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); /* * Do an ordered write for the log block. * Its unnecessary to flush the first split block in the log wrap case. */ if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER)) XFS_BUF_ORDERED(bp); ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); xlog_verify_iclog(log, iclog, count, B_TRUE); /* account for log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); /* * Don't call xfs_bwrite here. We do log-syncs even when the filesystem * is shutting down. */ XFS_BUF_WRITE(bp); if ((error = XFS_bwrite(bp))) { xfs_ioerror_alert("xlog_sync", log->l_mp, bp, XFS_BUF_ADDR(bp)); return error; } if (split) { bp = iclog->ic_log->l_xbuf; ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ (__psint_t)count), split); XFS_BUF_SET_FSPRIVATE(bp, iclog); XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) XFS_BUF_ORDERED(bp); dptr = XFS_BUF_PTR(bp); /* * Bump the cycle numbers at the start of each block * since this part of the buffer is at the start of * a new cycle. Watch out for the header magic number * case, though. */ for (i = 0; i < split; i += BBSIZE) { be32_add_cpu((__be32 *)dptr, 1); if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM) be32_add_cpu((__be32 *)dptr, 1); dptr += BBSIZE; } ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); /* account for internal log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); XFS_BUF_WRITE(bp); if ((error = XFS_bwrite(bp))) { xfs_ioerror_alert("xlog_sync (split)", log->l_mp, bp, XFS_BUF_ADDR(bp)); return error; } } return 0; } /* xlog_sync */ /* * Deallocate a log structure */ STATIC void xlog_dealloc_log(xlog_t *log) { xlog_in_core_t *iclog, *next_iclog; int i; iclog = log->l_iclog; for (i=0; i<log->l_iclog_bufs; i++) { sv_destroy(&iclog->ic_force_wait); sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); xlog_trace_iclog_dealloc(iclog); next_iclog = iclog->ic_next; kmem_free(iclog); iclog = next_iclog; } spinlock_destroy(&log->l_icloglock); spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); xlog_trace_loggrant_dealloc(log); log->l_mp->m_log = NULL; kmem_free(log); } /* xlog_dealloc_log */ /* * Update counters atomically now that memcpy is done. 
*/ /* ARGSUSED */ static inline void xlog_state_finish_copy(xlog_t *log, xlog_in_core_t *iclog, int record_cnt, int copy_bytes) { spin_lock(&log->l_icloglock); be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); iclog->ic_offset += copy_bytes; spin_unlock(&log->l_icloglock); } /* xlog_state_finish_copy */ /* * print out info relating to regions written which consume * the reservation */ STATIC void xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) { uint i; uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); /* match with XLOG_REG_TYPE_* in xfs_log.h */ static char *res_type_str[XLOG_REG_TYPE_MAX] = { "bformat", "bchunk", "efi_format", "efd_format", "iformat", "icore", "iext", "ibroot", "ilocal", "iattr_ext", "iattr_broot", "iattr_local", "qformat", "dquot", "quotaoff", "LR header", "unmount", "commit", "trans header" }; static char *trans_type_str[XFS_TRANS_TYPE_MAX] = { "SETATTR_NOT_SIZE", "SETATTR_SIZE", "INACTIVE", "CREATE", "CREATE_TRUNC", "TRUNCATE_FILE", "REMOVE", "LINK", "RENAME", "MKDIR", "RMDIR", "SYMLINK", "SET_DMATTRS", "GROWFS", "STRAT_WRITE", "DIOSTRAT", "WRITE_SYNC", "WRITEID", "ADDAFORK", "ATTRINVAL", "ATRUNCATE", "ATTR_SET", "ATTR_RM", "ATTR_FLAG", "CLEAR_AGI_BUCKET", "QM_SBCHANGE", "DUMMY1", "DUMMY2", "QM_QUOTAOFF", "QM_DQALLOC", "QM_SETQLIM", "QM_DQCLUSTER", "QM_QINOCREATE", "QM_QUOTAOFF_END", "SB_UNIT", "FSYNC_TS", "GROWFSRT_ALLOC", "GROWFSRT_ZERO", "GROWFSRT_FREE", "SWAPEXT" }; xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_write: reservation summary:\n" " trans type = %s (%u)\n" " unit res = %d bytes\n" " current res = %d bytes\n" " total reg = %u bytes (o/flow = %u bytes)\n" " ophdrs = %u (ophdr space = %u bytes)\n" " ophdr + reg = %u bytes\n" " num regions = %u\n", ((ticket->t_trans_type <= 0 || ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), ticket->t_trans_type, ticket->t_unit_res, ticket->t_curr_res, ticket->t_res_arr_sum, ticket->t_res_o_flow, ticket->t_res_num_ophdrs, ophdr_spc, ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc, ticket->t_res_num); for (i = 0; i < ticket->t_res_num; i++) { uint r_type = ticket->t_res_arr[i].r_type; cmn_err(CE_WARN, "region[%u]: %s - %u bytes\n", i, ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? "bad-rtype" : res_type_str[r_type-1]), ticket->t_res_arr[i].r_len); } } /* * Write some region out to in-core log * * This will be called when writing externally provided regions or when * writing out a commit record for a given transaction. * * General algorithm: * 1. Find total length of this write. This may include adding to the * lengths passed in. * 2. Check whether we violate the tickets reservation. * 3. While writing to this iclog * A. Reserve as much space in this iclog as can get * B. If this is first write, save away start lsn * C. While writing this region: * 1. If first write of transaction, write start record * 2. Write log operation header (header per region) * 3. Find out if we can fit entire region into this iclog * 4. Potentially, verify destination memcpy ptr * 5. Memcpy (partial) region * 6. If partial copy, release iclog; otherwise, continue * copying more regions into current iclog * 4. Mark want sync bit (in simulation mode) * 5. Release iclog for potential flush to on-disk log. * * ERRORS: * 1. Panic if reservation is overrun. This should never happen since * reservation amounts are generated internal to the filesystem. * NOTES: * 1. Tickets are single threaded data structures. * 2. 
The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the * syncing routine. When a single log_write region needs to span * multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set * on all log operation writes which don't contain the end of the * region. The XLOG_END_TRANS bit is used for the in-core log * operation which contains the end of the continued log_write region. * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog, * we don't really know exactly how much space will be used. As a result, * we don't update ic_offset until the end when we know exactly how many * bytes have been written out. */ STATIC int xlog_write(xfs_mount_t * mp, xfs_log_iovec_t reg[], int nentries, xfs_log_ticket_t tic, xfs_lsn_t *start_lsn, xlog_in_core_t **commit_iclog, uint flags) { xlog_t *log = mp->m_log; xlog_ticket_t *ticket = (xlog_ticket_t *)tic; xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ xlog_op_header_t *logop_head; /* ptr to log operation header */ __psint_t ptr; /* copy address into data region */ int len; /* # xlog_write() bytes 2 still copy */ int index; /* region index currently copying */ int log_offset; /* offset (from 0) into data region */ int start_rec_copy; /* # bytes to copy for start record */ int partial_copy; /* did we split a region? */ int partial_copy_len;/* # bytes copied if split region */ int need_copy; /* # bytes need to memcpy this region */ int copy_len; /* # bytes actually memcpy'ing */ int copy_off; /* # bytes from entry start */ int contwr; /* continued write of in-core log? */ int error; int record_cnt = 0, data_cnt = 0; partial_copy_len = partial_copy = 0; /* Calculate potential maximum space. Each region gets its own * xlog_op_header_t and may need to be double word aligned. */ len = 0; if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */ len += sizeof(xlog_op_header_t); ticket->t_res_num_ophdrs++; } for (index = 0; index < nentries; index++) { len += sizeof(xlog_op_header_t); /* each region gets >= 1 */ ticket->t_res_num_ophdrs++; len += reg[index].i_len; xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type); } contwr = *start_lsn = 0; if (ticket->t_curr_res < len) { xlog_print_tic_res(mp, ticket); #ifdef DEBUG xlog_panic( "xfs_log_write: reservation ran out. Need to up reservation"); #else /* Customer configurable panic */ xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, "xfs_log_write: reservation ran out. Need to up reservation"); /* If we did not panic, shutdown the filesystem */ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); #endif } else ticket->t_curr_res -= len; for (index = 0; index < nentries; ) { if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, &contwr, &log_offset))) return error; ASSERT(log_offset <= iclog->ic_size - 1); ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); /* start_lsn is the first lsn written to. That's all we need. */ if (! *start_lsn) *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); /* This loop writes out as many regions as can fit in the amount * of space which was allocated by xlog_state_get_iclog_space(). */ while (index < nentries) { ASSERT(reg[index].i_len % sizeof(__int32_t) == 0); ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0); start_rec_copy = 0; /* If first write for transaction, insert start record. * We can't be trying to commit if we are inited. We can't * have any "partial_copy" if we are inited. 
*/ if (ticket->t_flags & XLOG_TIC_INITED) { logop_head = (xlog_op_header_t *)ptr; logop_head->oh_tid = cpu_to_be32(ticket->t_tid); logop_head->oh_clientid = ticket->t_clientid; logop_head->oh_len = 0; logop_head->oh_flags = XLOG_START_TRANS; logop_head->oh_res2 = 0; ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */ record_cnt++; start_rec_copy = sizeof(xlog_op_header_t); xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy); } /* Copy log operation header directly into data section */ logop_head = (xlog_op_header_t *)ptr; logop_head->oh_tid = cpu_to_be32(ticket->t_tid); logop_head->oh_clientid = ticket->t_clientid; logop_head->oh_res2 = 0; /* header copied directly */ xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t)); /* are we copying a commit or unmount record? */ logop_head->oh_flags = flags; /* * We've seen logs corrupted with bad transaction client * ids. This makes sure that XFS doesn't generate them on. * Turn this into an EIO and shut down the filesystem. */ switch (logop_head->oh_clientid) { case XFS_TRANSACTION: case XFS_VOLUME: case XFS_LOG: break; default: xfs_fs_cmn_err(CE_WARN, mp, "Bad XFS transaction clientid 0x%x in ticket 0x%p", logop_head->oh_clientid, tic); return XFS_ERROR(EIO); } /* Partial write last time? => (partial_copy != 0) * need_copy is the amount we'd like to copy if everything could * fit in the current memcpy. */ need_copy = reg[index].i_len - partial_copy_len; copy_off = partial_copy_len; if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ copy_len = need_copy; logop_head->oh_len = cpu_to_be32(copy_len); if (partial_copy) logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); partial_copy_len = partial_copy = 0; } else { /* partial write */ copy_len = iclog->ic_size - log_offset; logop_head->oh_len = cpu_to_be32(copy_len); logop_head->oh_flags |= XLOG_CONTINUE_TRANS; if (partial_copy) logop_head->oh_flags |= XLOG_WAS_CONT_TRANS; partial_copy_len += copy_len; partial_copy++; len += sizeof(xlog_op_header_t); /* from splitting of region */ /* account for new log op header */ ticket->t_curr_res -= sizeof(xlog_op_header_t); ticket->t_res_num_ophdrs++; } xlog_verify_dest_ptr(log, ptr); /* copy region */ ASSERT(copy_len >= 0); memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len); xlog_write_adv_cnt(ptr, len, log_offset, copy_len); /* make copy_len total bytes copied, including headers */ copy_len += start_rec_copy + sizeof(xlog_op_header_t); record_cnt++; data_cnt += contwr ? 
copy_len : 0; if (partial_copy) { /* copied partial region */ /* already marked WANT_SYNC by xlog_state_get_iclog_space */ xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); record_cnt = data_cnt = 0; if ((error = xlog_state_release_iclog(log, iclog))) return error; break; /* don't increment index */ } else { /* copied entire region */ index++; partial_copy_len = partial_copy = 0; if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); record_cnt = data_cnt = 0; spin_lock(&log->l_icloglock); xlog_state_want_sync(log, iclog); spin_unlock(&log->l_icloglock); if (commit_iclog) { ASSERT(flags & XLOG_COMMIT_TRANS); *commit_iclog = iclog; } else if ((error = xlog_state_release_iclog(log, iclog))) return error; if (index == nentries) return 0; /* we are done */ else break; } } /* if (partial_copy) */ } /* while (index < nentries) */ } /* for (index = 0; index < nentries; ) */ ASSERT(len == 0); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); if (commit_iclog) { ASSERT(flags & XLOG_COMMIT_TRANS); *commit_iclog = iclog; return 0; } return xlog_state_release_iclog(log, iclog); } /* xlog_write */ /***************************************************************************** * * State Machine functions * ***************************************************************************** */ /* Clean iclogs starting from the head. This ordering must be * maintained, so an iclog doesn't become ACTIVE beyond one that * is SYNCING. This is also required to maintain the notion that we use * a ordered wait queue to hold off would be writers to the log when every * iclog is trying to sync to disk. * * State Change: DIRTY -> ACTIVE */ STATIC void xlog_state_clean_log(xlog_t *log) { xlog_in_core_t *iclog; int changed = 0; iclog = log->l_iclog; do { if (iclog->ic_state == XLOG_STATE_DIRTY) { iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_offset = 0; ASSERT(iclog->ic_callback == NULL); /* * If the number of ops in this iclog indicate it just * contains the dummy transaction, we can * change state into IDLE (the second time around). * Otherwise we should change the state into * NEED a dummy. * We don't need to cover the dummy. */ if (!changed && (be32_to_cpu(iclog->ic_header.h_num_logops) == XLOG_COVER_OPS)) { changed = 1; } else { /* * We have two dirty iclogs so start over * This could also be num of ops indicates * this is not the dummy going out. */ changed = 2; } iclog->ic_header.h_num_logops = 0; memset(iclog->ic_header.h_cycle_data, 0, sizeof(iclog->ic_header.h_cycle_data)); iclog->ic_header.h_lsn = 0; } else if (iclog->ic_state == XLOG_STATE_ACTIVE) /* do nothing */; else break; /* stop cleaning */ iclog = iclog->ic_next; } while (iclog != log->l_iclog); /* log is locked when we are called */ /* * Change state for the dummy log recording. * We usually go to NEED. But we go to NEED2 if the changed indicates * we are done writing the dummy record. * If we are done with the second dummy recored (DONE2), then * we go to IDLE. 
*/ if (changed) { switch (log->l_covered_state) { case XLOG_STATE_COVER_IDLE: case XLOG_STATE_COVER_NEED: case XLOG_STATE_COVER_NEED2: log->l_covered_state = XLOG_STATE_COVER_NEED; break; case XLOG_STATE_COVER_DONE: if (changed == 1) log->l_covered_state = XLOG_STATE_COVER_NEED2; else log->l_covered_state = XLOG_STATE_COVER_NEED; break; case XLOG_STATE_COVER_DONE2: if (changed == 1) log->l_covered_state = XLOG_STATE_COVER_IDLE; else log->l_covered_state = XLOG_STATE_COVER_NEED; break; default: ASSERT(0); } } } /* xlog_state_clean_log */ STATIC xfs_lsn_t xlog_get_lowest_lsn( xlog_t *log) { xlog_in_core_t *lsn_log; xfs_lsn_t lowest_lsn, lsn; lsn_log = log->l_iclog; lowest_lsn = 0; do { if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) { lsn = be64_to_cpu(lsn_log->ic_header.h_lsn); if ((lsn && !lowest_lsn) || (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) { lowest_lsn = lsn; } } lsn_log = lsn_log->ic_next; } while (lsn_log != log->l_iclog); return lowest_lsn; } STATIC void xlog_state_do_callback( xlog_t *log, int aborted, xlog_in_core_t *ciclog) { xlog_in_core_t *iclog; xlog_in_core_t *first_iclog; /* used to know when we've * processed all iclogs once */ xfs_log_callback_t *cb, *cb_next; int flushcnt = 0; xfs_lsn_t lowest_lsn; int ioerrors; /* counter: iclogs with errors */ int loopdidcallbacks; /* flag: inner loop did callbacks*/ int funcdidcallbacks; /* flag: function did callbacks */ int repeats; /* for issuing console warnings if * looping too many times */ int wake = 0; spin_lock(&log->l_icloglock); first_iclog = iclog = log->l_iclog; ioerrors = 0; funcdidcallbacks = 0; repeats = 0; do { /* * Scan all iclogs starting with the one pointed to by the * log. Reset this starting point each time the log is * unlocked (during callbacks). * * Keep looping through iclogs until one full pass is made * without running any callbacks. */ first_iclog = log->l_iclog; iclog = log->l_iclog; loopdidcallbacks = 0; repeats++; do { /* skip all iclogs in the ACTIVE & DIRTY states */ if (iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { iclog = iclog->ic_next; continue; } /* * Between marking a filesystem SHUTDOWN and stopping * the log, we do flush all iclogs to disk (if there * wasn't a log I/O error). So, we do want things to * go smoothly in case of just a SHUTDOWN w/o a * LOG_IO_ERROR. */ if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { /* * Can only perform callbacks in order. Since * this iclog is not in the DONE_SYNC/ * DO_CALLBACK state, we skip the rest and * just try to clean up. If we set our iclog * to DO_CALLBACK, we will not process it when * we retry since a previous iclog is in the * CALLBACK and the state cannot change since * we are holding the l_icloglock. */ if (!(iclog->ic_state & (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) { if (ciclog && (ciclog->ic_state == XLOG_STATE_DONE_SYNC)) { ciclog->ic_state = XLOG_STATE_DO_CALLBACK; } break; } /* * We now have an iclog that is in either the * DO_CALLBACK or DONE_SYNC states. The other * states (WANT_SYNC, SYNCING, or CALLBACK were * caught by the above if and are going to * clean (i.e. we aren't doing their callbacks) * see the above if. */ /* * We will do one more check here to see if we * have chased our tail around. 
*/ lowest_lsn = xlog_get_lowest_lsn(log); if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { iclog = iclog->ic_next; continue; /* Leave this iclog for * another thread */ } iclog->ic_state = XLOG_STATE_CALLBACK; spin_unlock(&log->l_icloglock); /* l_last_sync_lsn field protected by * l_grant_lock. Don't worry about iclog's lsn. * No one else can be here except us. */ spin_lock(&log->l_grant_lock); ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); log->l_last_sync_lsn = be64_to_cpu(iclog->ic_header.h_lsn); spin_unlock(&log->l_grant_lock); } else { spin_unlock(&log->l_icloglock); ioerrors++; } /* * Keep processing entries in the callback list until * we come around and it is empty. We need to * atomically see that the list is empty and change the * state to DIRTY so that we don't miss any more * callbacks being added. */ spin_lock(&iclog->ic_callback_lock); cb = iclog->ic_callback; while (cb) { iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_callback = NULL; spin_unlock(&iclog->ic_callback_lock); /* perform callbacks in the order given */ for (; cb; cb = cb_next) { cb_next = cb->cb_next; cb->cb_func(cb->cb_arg, aborted); } spin_lock(&iclog->ic_callback_lock); cb = iclog->ic_callback; } loopdidcallbacks++; funcdidcallbacks++; spin_lock(&log->l_icloglock); ASSERT(iclog->ic_callback == NULL); spin_unlock(&iclog->ic_callback_lock); if (!(iclog->ic_state & XLOG_STATE_IOERROR)) iclog->ic_state = XLOG_STATE_DIRTY; /* * Transition from DIRTY to ACTIVE if applicable. * NOP if STATE_IOERROR. */ xlog_state_clean_log(log); /* wake up threads waiting in xfs_log_force() */ sv_broadcast(&iclog->ic_force_wait); iclog = iclog->ic_next; } while (first_iclog != iclog); if (repeats > 5000) { flushcnt += repeats; repeats = 0; xfs_fs_cmn_err(CE_WARN, log->l_mp, "%s: possible infinite loop (%d iterations)", __func__, flushcnt); } } while (!ioerrors && loopdidcallbacks); /* * make one last gasp attempt to see if iclogs are being left in * limbo.. */ #ifdef DEBUG if (funcdidcallbacks) { first_iclog = iclog = log->l_iclog; do { ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); /* * Terminate the loop if iclogs are found in states * which will cause other threads to clean up iclogs. * * SYNCING - i/o completion will go through logs * DONE_SYNC - interrupt thread should be waiting for * l_icloglock * IOERROR - give up hope all ye who enter here */ if (iclog->ic_state == XLOG_STATE_WANT_SYNC || iclog->ic_state == XLOG_STATE_SYNCING || iclog->ic_state == XLOG_STATE_DONE_SYNC || iclog->ic_state == XLOG_STATE_IOERROR ) break; iclog = iclog->ic_next; } while (first_iclog != iclog); } #endif if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) wake = 1; spin_unlock(&log->l_icloglock); if (wake) sv_broadcast(&log->l_flush_wait); } /* * Finish transitioning this iclog to the dirty state. * * Make sure that we completely execute this routine only when this is * the last call to the iclog. There is a good chance that iclog flushes, * when we reach the end of the physical log, get turned into 2 separate * calls to bwrite. Hence, one iclog flush could generate two calls to this * routine. By using the reference count bwritecnt, we guarantee that only * the second completion goes through. * * Callbacks could take time, so they are done outside the scope of the * global state machine log lock. 
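 *
 * As a sketch of the split-write case handled below: when an iclog wraps
 * the end of the physical log, xlog_sync() issues two buffer writes and
 * ic_bwritecnt is 2.  The first completion only decrements the count to 1
 * and returns; the second completion decrements it to 0, moves the iclog
 * to DONE_SYNC, wakes any writers sleeping on ic_write_wait and kicks off
 * the callback processing.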
*/ STATIC void xlog_state_done_syncing( xlog_in_core_t *iclog, int aborted) { xlog_t *log = iclog->ic_log; spin_lock(&log->l_icloglock); ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || iclog->ic_state == XLOG_STATE_IOERROR); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); /* * If we got an error, either on the first buffer, or in the case of * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, * and none should ever be attempted to be written to disk * again. */ if (iclog->ic_state != XLOG_STATE_IOERROR) { if (--iclog->ic_bwritecnt == 1) { spin_unlock(&log->l_icloglock); return; } iclog->ic_state = XLOG_STATE_DONE_SYNC; } /* * Someone could be sleeping prior to writing out the next * iclog buffer, we wake them all, one will get to do the * I/O, the others get to wait for the result. */ sv_broadcast(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ } /* xlog_state_done_syncing */ /* * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must * sleep. We wait on the flush queue on the head iclog as that should be * the first iclog to complete flushing. Hence if all iclogs are syncing, * we will wait here and all new writes will sleep until a sync completes. * * The in-core logs are used in a circular fashion. They are not used * out-of-order even when an iclog past the head is free. * * return: * * log_offset where xlog_write() can start writing into the in-core * log's data space. * * in-core log pointer to which xlog_write() should write. * * boolean indicating this is a continued write to an in-core log. * If this is the last write, then the in-core log's offset field * needs to be incremented, depending on the amount of data which * is copied. */ STATIC int xlog_state_get_iclog_space(xlog_t *log, int len, xlog_in_core_t **iclogp, xlog_ticket_t *ticket, int *continued_write, int *logoffsetp) { int log_offset; xlog_rec_header_t *head; xlog_in_core_t *iclog; int error; restart: spin_lock(&log->l_icloglock); if (XLOG_FORCED_SHUTDOWN(log)) { spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } iclog = log->l_iclog; if (iclog->ic_state != XLOG_STATE_ACTIVE) { xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH); XFS_STATS_INC(xs_log_noiclogs); /* Wait for log writes to have flushed */ sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); goto restart; } head = &iclog->ic_header; atomic_inc(&iclog->ic_refcnt); /* prevents sync */ log_offset = iclog->ic_offset; /* On the 1st write to an iclog, figure out lsn. This works * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are * committing to. If the offset is set, that's how many blocks * must be written. */ if (log_offset == 0) { ticket->t_curr_res -= log->l_iclog_hsize; xlog_tic_add_region(ticket, log->l_iclog_hsize, XLOG_REG_TYPE_LRHEADER); head->h_cycle = cpu_to_be32(log->l_curr_cycle); head->h_lsn = cpu_to_be64( xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); ASSERT(log->l_curr_block >= 0); } /* If there is enough room to write everything, then do it. Otherwise, * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC * bit is on, so this will get flushed out. Don't update ic_offset * until you know exactly how many bytes get copied. Therefore, wait * until later to update ic_offset. * * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's * can fit into remaining data section. 
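	 *
	 * For illustration only (the numbers are made up; assume a 32 KB
	 * iclog and a 12-byte xlog_op_header_t):
	 *
	 *	ic_size - ic_offset = 32768 - 32750 = 18  <  2 * 12
	 *
	 * so the check below switches this iclog to WANT_SYNC, drops the
	 * reference taken above and restarts on the next iclog rather than
	 * starting a copy that could not even fit its own op headers.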
*/ if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { xlog_state_switch_iclogs(log, iclog, iclog->ic_size); /* * If I'm the only one writing to this iclog, sync it to disk. * We need to do an atomic compare and decrement here to avoid * racing with concurrent atomic_dec_and_lock() calls in * xlog_state_release_iclog() when there is more than one * reference to the iclog. */ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { /* we are the only one */ spin_unlock(&log->l_icloglock); error = xlog_state_release_iclog(log, iclog); if (error) return error; } else { spin_unlock(&log->l_icloglock); } goto restart; } /* Do we have enough room to write the full amount in the remainder * of this iclog? Or must we continue a write on the next iclog and * mark this iclog as completely taken? In the case where we switch * iclogs (to mark it taken), this particular iclog will release/sync * to disk in xlog_write(). */ if (len <= iclog->ic_size - iclog->ic_offset) { *continued_write = 0; iclog->ic_offset += len; } else { *continued_write = 1; xlog_state_switch_iclogs(log, iclog, iclog->ic_size); } *iclogp = iclog; ASSERT(iclog->ic_offset <= iclog->ic_size); spin_unlock(&log->l_icloglock); *logoffsetp = log_offset; return 0; } /* xlog_state_get_iclog_space */ /* * Atomically get the log space required for a log ticket. * * Once a ticket gets put onto the reserveq, it will only return after * the needed reservation is satisfied. */ STATIC int xlog_grant_log_space(xlog_t *log, xlog_ticket_t *tic) { int free_bytes; int need_bytes; #ifdef DEBUG xfs_lsn_t tail_lsn; #endif #ifdef DEBUG if (log->l_flags & XLOG_ACTIVE_RECOVERY) panic("grant Recovery problem"); #endif /* Is there space or do we need to sleep? */ spin_lock(&log->l_grant_lock); xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter"); /* something is already sleeping; insert new transaction at end */ if (log->l_reserve_headq) { xlog_ins_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_grant_log_space: sleep 1"); /* * Gotta check this before going to sleep, while we're * holding the grant lock. */ if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; XFS_STATS_INC(xs_sleep_logspace); sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* * If we got an error, and the filesystem is shutting down, * we'll catch it down below. So just continue... */ xlog_trace_loggrant(log, tic, "xlog_grant_log_space: wake 1"); spin_lock(&log->l_grant_lock); } if (tic->t_flags & XFS_LOG_PERM_RESERV) need_bytes = tic->t_unit_res*tic->t_ocnt; else need_bytes = tic->t_unit_res; redo: if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, log->l_grant_reserve_bytes); if (free_bytes < need_bytes) { if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) xlog_ins_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_grant_log_space: sleep 2"); spin_unlock(&log->l_grant_lock); xlog_grant_push_ail(log->l_mp, need_bytes); spin_lock(&log->l_grant_lock); XFS_STATS_INC(xs_sleep_logspace); sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); spin_lock(&log->l_grant_lock); if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; xlog_trace_loggrant(log, tic, "xlog_grant_log_space: wake 2"); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_reserve_headq, tic); /* we've got enough space */ xlog_grant_add_space(log, need_bytes); #ifdef DEBUG tail_lsn = log->l_tail_lsn; /* * Check to make sure the grant write head didn't just over lap the * tail. 
If the cycles are the same, we can't be overlapping. * Otherwise, make sure that the cycles differ by exactly one and * check the byte count. */ if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); } #endif xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit"); xlog_verify_grant_head(log, 1); spin_unlock(&log->l_grant_lock); return 0; error_return: if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret"); /* * If we are failing, make sure the ticket doesn't have any * current reservations. We don't want to add this back when * the ticket/transaction gets cancelled. */ tic->t_curr_res = 0; tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ spin_unlock(&log->l_grant_lock); return XFS_ERROR(EIO); } /* xlog_grant_log_space */ /* * Replenish the byte reservation required by moving the grant write head. * * */ STATIC int xlog_regrant_write_log_space(xlog_t *log, xlog_ticket_t *tic) { int free_bytes, need_bytes; xlog_ticket_t *ntic; #ifdef DEBUG xfs_lsn_t tail_lsn; #endif tic->t_curr_res = tic->t_unit_res; xlog_tic_reset_res(tic); if (tic->t_cnt > 0) return 0; #ifdef DEBUG if (log->l_flags & XLOG_ACTIVE_RECOVERY) panic("regrant Recovery problem"); #endif spin_lock(&log->l_grant_lock); xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter"); if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; /* If there are other waiters on the queue then give them a * chance at logspace before us. Wake up the first waiters, * if we do not wake up all the waiters then go to sleep waiting * for more free space, otherwise try to get some space for * this transaction. 
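	 *
	 * For example (illustrative numbers only): with 6 KB of write grant
	 * space free and waiters queued needing 2 KB, 3 KB and 4 KB, the
	 * loop below wakes the first two, stops at the 4 KB waiter, and,
	 * because not every waiter could be woken, this ticket queues itself
	 * behind them and sleeps until more space is freed up.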
*/ need_bytes = tic->t_unit_res; if ((ntic = log->l_write_headq)) { free_bytes = xlog_space_left(log, log->l_grant_write_cycle, log->l_grant_write_bytes); do { ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); if (free_bytes < ntic->t_unit_res) break; free_bytes -= ntic->t_unit_res; sv_signal(&ntic->t_wait); ntic = ntic->t_next; } while (ntic != log->l_write_headq); if (ntic != log->l_write_headq) { if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) xlog_ins_ticketq(&log->l_write_headq, tic); xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: sleep 1"); spin_unlock(&log->l_grant_lock); xlog_grant_push_ail(log->l_mp, need_bytes); spin_lock(&log->l_grant_lock); XFS_STATS_INC(xs_sleep_logspace); sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already * off the queue */ spin_lock(&log->l_grant_lock); if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: wake 1"); } } redo: if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; free_bytes = xlog_space_left(log, log->l_grant_write_cycle, log->l_grant_write_bytes); if (free_bytes < need_bytes) { if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) xlog_ins_ticketq(&log->l_write_headq, tic); spin_unlock(&log->l_grant_lock); xlog_grant_push_ail(log->l_mp, need_bytes); spin_lock(&log->l_grant_lock); XFS_STATS_INC(xs_sleep_logspace); sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already off the queue */ spin_lock(&log->l_grant_lock); if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: wake 2"); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_write_headq, tic); /* we've got enough space */ xlog_grant_add_space_write(log, need_bytes); #ifdef DEBUG tail_lsn = log->l_tail_lsn; if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); } #endif xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit"); xlog_verify_grant_head(log, 1); spin_unlock(&log->l_grant_lock); return 0; error_return: if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret"); /* * If we are failing, make sure the ticket doesn't have any * current reservations. We don't want to add this back when * the ticket/transaction gets cancelled. */ tic->t_curr_res = 0; tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ spin_unlock(&log->l_grant_lock); return XFS_ERROR(EIO); } /* xlog_regrant_write_log_space */ /* The first cnt-1 times through here we don't need to * move the grant write head because the permanent * reservation has reserved cnt times the unit amount. * Release part of current permanent unit reservation and * reset current reservation to be one units worth. Also * move grant reservation head forward. 
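 *
 * A sketch with made-up numbers: a permanent ticket created with cnt = 3
 * and unit reservation U has 3*U reserved up front.  The first two trips
 * through here only decrement t_cnt and refill t_curr_res from space that
 * is already reserved; from the trip that takes t_cnt to zero onwards,
 * each pass also has to add another U onto the reserve grant head before
 * the next transaction in the series can proceed.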
*/ STATIC void xlog_regrant_reserve_log_space(xlog_t *log, xlog_ticket_t *ticket) { xlog_trace_loggrant(log, ticket, "xlog_regrant_reserve_log_space: enter"); if (ticket->t_cnt > 0) ticket->t_cnt--; spin_lock(&log->l_grant_lock); xlog_grant_sub_space(log, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); xlog_trace_loggrant(log, ticket, "xlog_regrant_reserve_log_space: sub current res"); xlog_verify_grant_head(log, 1); /* just return if we still have some of the pre-reserved space */ if (ticket->t_cnt > 0) { spin_unlock(&log->l_grant_lock); return; } xlog_grant_add_space_reserve(log, ticket->t_unit_res); xlog_trace_loggrant(log, ticket, "xlog_regrant_reserve_log_space: exit"); xlog_verify_grant_head(log, 0); spin_unlock(&log->l_grant_lock); ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); } /* xlog_regrant_reserve_log_space */ /* * Give back the space left from a reservation. * * All the information we need to make a correct determination of space left * is present. For non-permanent reservations, things are quite easy. The * count should have been decremented to zero. We only need to deal with the * space remaining in the current reservation part of the ticket. If the * ticket contains a permanent reservation, there may be left over space which * needs to be released. A count of N means that N-1 refills of the current * reservation can be done before we need to ask for more space. The first * one goes to fill up the first current reservation. Once we run out of * space, the count will stay at zero and the only space remaining will be * in the current reservation field. */ STATIC void xlog_ungrant_log_space(xlog_t *log, xlog_ticket_t *ticket) { if (ticket->t_cnt > 0) ticket->t_cnt--; spin_lock(&log->l_grant_lock); xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter"); xlog_grant_sub_space(log, ticket->t_curr_res); xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current"); /* If this is a permanent reservation ticket, we may be able to free * up more space based on the remaining count. */ if (ticket->t_cnt > 0) { ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); } xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit"); xlog_verify_grant_head(log, 1); spin_unlock(&log->l_grant_lock); xfs_log_move_tail(log->l_mp, 1); } /* xlog_ungrant_log_space */ /* * Flush iclog to disk if this is the last reference to the given iclog and * the WANT_SYNC bit is set. * * When this function is entered, the iclog is not necessarily in the * WANT_SYNC state. It may be sitting around waiting to get filled. * * */ STATIC int xlog_state_release_iclog( xlog_t *log, xlog_in_core_t *iclog) { int sync = 0; /* do we sync? 
 */
	if (iclog->ic_state & XLOG_STATE_IOERROR)
		return XFS_ERROR(EIO);

	ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
	if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
		return 0;

	if (iclog->ic_state & XLOG_STATE_IOERROR) {
		spin_unlock(&log->l_icloglock);
		return XFS_ERROR(EIO);
	}
	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
	       iclog->ic_state == XLOG_STATE_WANT_SYNC);

	if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
		/* update tail before writing to iclog */
		xlog_assign_tail_lsn(log->l_mp);
		sync++;
		iclog->ic_state = XLOG_STATE_SYNCING;
		iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn);
		xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
		/* cycle incremented when incrementing curr_block */
	}
	spin_unlock(&log->l_icloglock);

	/*
	 * We let the log lock go, so it's possible that we hit a log I/O
	 * error or some other SHUTDOWN condition that marks the iclog
	 * as XLOG_STATE_IOERROR before the bwrite.  However, we know that
	 * this iclog has consistent data, so we ignore IOERROR
	 * flags after this point.
	 */
	if (sync)
		return xlog_sync(log, iclog);
	return 0;
}	/* xlog_state_release_iclog */


/*
 * This routine will mark the current iclog in the ring as WANT_SYNC
 * and move the current iclog pointer to the next iclog in the ring.
 * When this routine is called from xlog_state_get_iclog_space(), the
 * exact size of the iclog has not yet been determined.  All we know is
 * that we have run out of space in this log record.
 */
STATIC void
xlog_state_switch_iclogs(xlog_t		*log,
			 xlog_in_core_t	*iclog,
			 int		eventual_size)
{
	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
	if (!eventual_size)
		eventual_size = iclog->ic_offset;
	iclog->ic_state = XLOG_STATE_WANT_SYNC;
	iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block);
	log->l_prev_block = log->l_curr_block;
	log->l_prev_cycle = log->l_curr_cycle;

	/* roll log?: ic_offset changed later */
	log->l_curr_block += BTOBB(eventual_size) + BTOBB(log->l_iclog_hsize);

	/* Round up to next log-sunit */
	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
	    log->l_mp->m_sb.sb_logsunit > 1) {
		__uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
		log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
	}

	if (log->l_curr_block >= log->l_logBBsize) {
		log->l_curr_cycle++;
		if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
			log->l_curr_cycle++;
		log->l_curr_block -= log->l_logBBsize;
		ASSERT(log->l_curr_block >= 0);
	}
	ASSERT(iclog == log->l_iclog);
	log->l_iclog = iclog->ic_next;
}	/* xlog_state_switch_iclogs */


/*
 * Write out all data in the in-core log as of this exact moment in time.
 *
 * Data may be written to the in-core log during this call.  However,
 * we don't guarantee this data will be written out.  A change from past
 * implementation means this routine will *not* write out zero length LRs.
 *
 * Basically, we try to perform an intelligent scan of the in-core logs.
 * If we determine there is no flushable data, we just return.  There is no
 * flushable data if:
 *
 * 1. the current iclog is active and has no data; the previous iclog
 *    is in the active or dirty state.
 * 2. the current iclog is dirty, and the previous iclog is in the
 *    active or dirty state.
 *
 * We may sleep if:
 *
 * 1. the current iclog is not in the active or dirty state.
 * 2. the current iclog is dirty, and the previous iclog is not in the
 *    active or dirty state.
 * 3. the current iclog is active, and there is another thread writing
 *    to this particular iclog.
 * 4.
a) the current iclog is active and has no other writers * b) when we return from flushing out this iclog, it is still * not in the active nor dirty state. */ STATIC int xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) { xlog_in_core_t *iclog; xfs_lsn_t lsn; spin_lock(&log->l_icloglock); iclog = log->l_iclog; if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } /* If the head iclog is not active nor dirty, we just attach * ourselves to the head and go to sleep. */ if (iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY) { /* * If the head is dirty or (active and empty), then * we need to look at the previous iclog. If the previous * iclog is active or dirty we are done. There is nothing * to sync out. Otherwise, we attach ourselves to the * previous iclog and go to sleep. */ if (iclog->ic_state == XLOG_STATE_DIRTY || (atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) { iclog = iclog->ic_prev; if (iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY) goto no_sleep; else goto maybe_sleep; } else { if (atomic_read(&iclog->ic_refcnt) == 0) { /* We are the only one with access to this * iclog. Flush it out now. There should * be a roundoff of zero to show that someone * has already taken care of the roundoff from * the previous sync. */ atomic_inc(&iclog->ic_refcnt); lsn = be64_to_cpu(iclog->ic_header.h_lsn); xlog_state_switch_iclogs(log, iclog, 0); spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) return XFS_ERROR(EIO); *log_flushed = 1; spin_lock(&log->l_icloglock); if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && iclog->ic_state != XLOG_STATE_DIRTY) goto maybe_sleep; else goto no_sleep; } else { /* Someone else is writing to this iclog. * Use its call to flush out the data. However, * the other thread may not force out this LR, * so we mark it WANT_SYNC. */ xlog_state_switch_iclogs(log, iclog, 0); goto maybe_sleep; } } } /* By the time we come around again, the iclog could've been filled * which would give it another lsn. If we have a new lsn, just * return because the relevant data has been flushed. */ maybe_sleep: if (flags & XFS_LOG_SYNC) { /* * We must check if we're shutting down here, before * we wait, while we're holding the l_icloglock. * Then we check again after waking up, in case our * sleep was disturbed by a bad news. */ if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO * and the memory read should be atomic. */ if (iclog->ic_state & XLOG_STATE_IOERROR) return XFS_ERROR(EIO); *log_flushed = 1; } else { no_sleep: spin_unlock(&log->l_icloglock); } return 0; } /* xlog_state_sync_all */ /* * Used by code which implements synchronous log forces. * * Find in-core log with lsn. * If it is in the DIRTY state, just return. * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC * state and go to sleep or return. * If it is in any other state, go to sleep or return. * * If filesystem activity goes to zero, the iclog will get flushed only by * bdflush(). 
*/ STATIC int xlog_state_sync(xlog_t *log, xfs_lsn_t lsn, uint flags, int *log_flushed) { xlog_in_core_t *iclog; int already_slept = 0; try_again: spin_lock(&log->l_icloglock); iclog = log->l_iclog; if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } do { if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { iclog = iclog->ic_next; continue; } if (iclog->ic_state == XLOG_STATE_DIRTY) { spin_unlock(&log->l_icloglock); return 0; } if (iclog->ic_state == XLOG_STATE_ACTIVE) { /* * We sleep here if we haven't already slept (e.g. * this is the first time we've looked at the correct * iclog buf) and the buffer before us is going to * be sync'ed. The reason for this is that if we * are doing sync transactions here, by waiting for * the previous I/O to complete, we can allow a few * more transactions into this iclog before we close * it down. * * Otherwise, we mark the buffer WANT_SYNC, and bump * up the refcnt so we can release the log (which drops * the ref count). The state switch keeps new transaction * commits from using this buffer. When the current commits * finish writing into the buffer, the refcount will drop to * zero and the buffer will go out then. */ if (!already_slept && (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); XFS_STATS_INC(xs_log_force_sleep); sv_wait(&iclog->ic_prev->ic_write_wait, PSWP, &log->l_icloglock, s); *log_flushed = 1; already_slept = 1; goto try_again; } else { atomic_inc(&iclog->ic_refcnt); xlog_state_switch_iclogs(log, iclog, 0); spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) return XFS_ERROR(EIO); *log_flushed = 1; spin_lock(&log->l_icloglock); } } if ((flags & XFS_LOG_SYNC) && /* sleep */ !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { /* * Don't wait on completion if we know that we've * gotten a log write error. */ if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO * and the memory read should be atomic. */ if (iclog->ic_state & XLOG_STATE_IOERROR) return XFS_ERROR(EIO); *log_flushed = 1; } else { /* just return */ spin_unlock(&log->l_icloglock); } return 0; } while (iclog != log->l_iclog); spin_unlock(&log->l_icloglock); return 0; } /* xlog_state_sync */ /* * Called when we want to mark the current iclog as being ready to sync to * disk. */ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) { assert_spin_locked(&log->l_icloglock); if (iclog->ic_state == XLOG_STATE_ACTIVE) { xlog_state_switch_iclogs(log, iclog, 0); } else { ASSERT(iclog->ic_state & (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); } } /***************************************************************************** * * TICKET functions * ***************************************************************************** */ /* * Free a used ticket when its refcount falls to zero. */ void xfs_log_ticket_put( xlog_ticket_t *ticket) { ASSERT(atomic_read(&ticket->t_ref) > 0); if (atomic_dec_and_test(&ticket->t_ref)) { sv_destroy(&ticket->t_wait); kmem_zone_free(xfs_log_ticket_zone, ticket); } } xlog_ticket_t * xfs_log_ticket_get( xlog_ticket_t *ticket) { ASSERT(atomic_read(&ticket->t_ref) > 0); atomic_inc(&ticket->t_ref); return ticket; } /* * Allocate and initialise a new log ticket. 
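 *
 * As a rough, illustrative example of the accounting done below (struct
 * sizes and iclog geometry vary by build and mount options): a normal
 * ticket asking to reserve, say, 64 KB of payload also pays for the
 * start-record and transaction-header op headers, one log record header
 * for each iclog the payload may span, the op headers added when a
 * region is split across log records, a record header for the commit
 * record, and worst-case roundoff at both syncs, so the unit reservation
 * ends up a few KB larger than the payload itself.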
*/ STATIC xlog_ticket_t * xlog_ticket_alloc(xlog_t *log, int unit_bytes, int cnt, char client, uint xflags) { xlog_ticket_t *tic; uint num_headers; tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); if (!tic) return NULL; /* * Permanent reservations have up to 'cnt'-1 active log operations * in the log. A unit in this case is the amount of space for one * of these log operations. Normal reservations have a cnt of 1 * and their unit amount is the total amount of space required. * * The following lines of code account for non-transaction data * which occupy space in the on-disk log. * * Normal form of a transaction is: * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph> * and then there are LR hdrs, split-recs and roundoff at end of syncs. * * We need to account for all the leadup data and trailer data * around the transaction data. * And then we need to account for the worst case in terms of using * more space. * The worst case will happen if: * - the placement of the transaction happens to be such that the * roundoff is at its maximum * - the transaction data is synced before the commit record is synced * i.e. <transaction-data><roundoff> | <commit-rec><roundoff> * Therefore the commit record is in its own Log Record. * This can happen as the commit record is called with its * own region to xlog_write(). * This then means that in the worst case, roundoff can happen for * the commit-rec as well. * The commit-rec is smaller than padding in this scenario and so it is * not added separately. */ /* for trans header */ unit_bytes += sizeof(xlog_op_header_t); unit_bytes += sizeof(xfs_trans_header_t); /* for start-rec */ unit_bytes += sizeof(xlog_op_header_t); /* for LR headers */ num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log); unit_bytes += log->l_iclog_hsize * num_headers; /* for commit-rec LR header - note: padding will subsume the ophdr */ unit_bytes += log->l_iclog_hsize; /* for split-recs - ophdrs added when data split over LRs */ unit_bytes += sizeof(xlog_op_header_t) * num_headers; /* for roundoff padding for transaction data and one for commit record */ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && log->l_mp->m_sb.sb_logsunit > 1) { /* log su roundoff */ unit_bytes += 2*log->l_mp->m_sb.sb_logsunit; } else { /* BB roundoff */ unit_bytes += 2*BBSIZE; } atomic_set(&tic->t_ref, 1); tic->t_unit_res = unit_bytes; tic->t_curr_res = unit_bytes; tic->t_cnt = cnt; tic->t_ocnt = cnt; tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; tic->t_trans_type = 0; if (xflags & XFS_LOG_PERM_RESERV) tic->t_flags |= XLOG_TIC_PERM_RESERV; sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); xlog_tic_reset_res(tic); return tic; } /****************************************************************************** * * Log debug routines * ****************************************************************************** */ #if defined(DEBUG) /* * Make sure that the destination ptr is within the valid data region of * one of the iclogs. This uses backup pointers stored in a different * part of the log in case we trash the log structure. */ void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr) { int i; int good_ptr = 0; for (i=0; i < log->l_iclog_bufs; i++) { if (ptr >= (__psint_t)log->l_iclog_bak[i] && ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size) good_ptr++; } if (! 
good_ptr) xlog_panic("xlog_verify_dest_ptr: invalid ptr"); } /* xlog_verify_dest_ptr */ STATIC void xlog_verify_grant_head(xlog_t *log, int equals) { if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { if (equals) ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); else ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); } else { ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); } } /* xlog_verify_grant_head */ /* check if it will fit */ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, xfs_lsn_t tail_lsn) { int blocks; if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { blocks = log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) xlog_panic("xlog_verify_tail_lsn: ran out of log space"); } else { ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); if (BLOCK_LSN(tail_lsn) == log->l_prev_block) xlog_panic("xlog_verify_tail_lsn: tail wrapped"); blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; if (blocks < BTOBB(iclog->ic_offset) + 1) xlog_panic("xlog_verify_tail_lsn: ran out of log space"); } } /* xlog_verify_tail_lsn */ /* * Perform a number of checks on the iclog before writing to disk. * * 1. Make sure the iclogs are still circular * 2. Make sure we have a good magic number * 3. Make sure we don't have magic numbers in the data * 4. Check fields of each log operation header for: * A. Valid client identifier * B. tid ptr value falls in valid ptr space (user space code) * C. Length in log record header is correct according to the * individual operation headers within record. * 5. When a bwrite will occur within 5 blocks of the front of the physical * log, check the preceding blocks of the physical log to make sure all * the cycle numbers agree with the current cycle number. 
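 *
 * A hedged note on check 4 for the syncing case: by the time an iclog is
 * being synced, the first 4 bytes of each 512-byte block of the data area
 * have been replaced with the cycle number, so a clientid or length field
 * that happens to start exactly on such a boundary has to be read back
 * from h_cycle_data[] (or from the extended headers for blocks beyond the
 * first header's window) instead of from the data itself.  For example,
 * assuming the usual 64-block window per header, a field starting in data
 * block 70 is recovered from xhdr[1].hic_xheader.xh_cycle_data[6].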
*/ STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, int count, boolean_t syncing) { xlog_op_header_t *ophead; xlog_in_core_t *icptr; xlog_in_core_2_t *xhdr; xfs_caddr_t ptr; xfs_caddr_t base_ptr; __psint_t field_offset; __uint8_t clientid; int len, i, j, k, op_len; int idx; /* check validity of iclog pointers */ spin_lock(&log->l_icloglock); icptr = log->l_iclog; for (i=0; i < log->l_iclog_bufs; i++) { if (icptr == NULL) xlog_panic("xlog_verify_iclog: invalid ptr"); icptr = icptr->ic_next; } if (icptr != log->l_iclog) xlog_panic("xlog_verify_iclog: corrupt iclog ring"); spin_unlock(&log->l_icloglock); /* check log magic numbers */ if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM) xlog_panic("xlog_verify_iclog: invalid magic num"); ptr = (xfs_caddr_t) &iclog->ic_header; for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; ptr += BBSIZE) { if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) xlog_panic("xlog_verify_iclog: unexpected magic num"); } /* check fields */ len = be32_to_cpu(iclog->ic_header.h_num_logops); ptr = iclog->ic_datap; base_ptr = ptr; ophead = (xlog_op_header_t *)ptr; xhdr = iclog->ic_data; for (i = 0; i < len; i++) { ophead = (xlog_op_header_t *)ptr; /* clientid is only 1 byte */ field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); if (syncing == B_FALSE || (field_offset & 0x1ff)) { clientid = ophead->oh_clientid; } else { idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); clientid = xlog_get_client_id( xhdr[j].hic_xheader.xh_cycle_data[k]); } else { clientid = xlog_get_client_id( iclog->ic_header.h_cycle_data[idx]); } } if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) cmn_err(CE_WARN, "xlog_verify_iclog: " "invalid clientid %d op 0x%p offset 0x%lx", clientid, ophead, (unsigned long)field_offset); /* check length */ field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); if (syncing == B_FALSE || (field_offset & 0x1ff)) { op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((__psint_t)&ophead->oh_len - (__psint_t)iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]); } else { op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); } } ptr += sizeof(xlog_op_header_t) + op_len; } } /* xlog_verify_iclog */ #endif /* * Mark all iclogs IOERROR. l_icloglock is held by the caller. */ STATIC int xlog_state_ioerror( xlog_t *log) { xlog_in_core_t *iclog, *ic; iclog = log->l_iclog; if (! (iclog->ic_state & XLOG_STATE_IOERROR)) { /* * Mark all the incore logs IOERROR. * From now on, no log flushes will result. */ ic = iclog; do { ic->ic_state = XLOG_STATE_IOERROR; ic = ic->ic_next; } while (ic != iclog); return 0; } /* * Return non-zero, if state transition has already happened. */ return 1; } /* * This is called from xfs_force_shutdown, when we're forcibly * shutting down the filesystem, typically because of an IO error. * Our main objectives here are to make sure that: * a. the filesystem gets marked 'SHUTDOWN' for all interested * parties to find out, 'atomically'. * b. those who're sleeping on log reservations, pinned objects and * other resources get woken up, and be told the bad news. * c. nothing new gets queued up after (a) and (b) are done. * d. 
if !logerror, flush the iclogs to disk, then seal them off * for business. */ int xfs_log_force_umount( struct xfs_mount *mp, int logerror) { xlog_ticket_t *tic; xlog_t *log; int retval; int dummy; log = mp->m_log; /* * If this happens during log recovery, don't worry about * locking; the log isn't open for business yet. */ if (!log || log->l_flags & XLOG_ACTIVE_RECOVERY) { mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; if (mp->m_sb_bp) XFS_BUF_DONE(mp->m_sb_bp); return 0; } /* * Somebody could've already done the hard work for us. * No need to get locks for this. */ if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) { ASSERT(XLOG_FORCED_SHUTDOWN(log)); return 1; } retval = 0; /* * We must hold both the GRANT lock and the LOG lock, * before we mark the filesystem SHUTDOWN and wake * everybody up to tell the bad news. */ spin_lock(&log->l_icloglock); spin_lock(&log->l_grant_lock); mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; if (mp->m_sb_bp) XFS_BUF_DONE(mp->m_sb_bp); /* * This flag is sort of redundant because of the mount flag, but * it's good to maintain the separation between the log and the rest * of XFS. */ log->l_flags |= XLOG_IO_ERROR; /* * If we hit a log error, we want to mark all the iclogs IOERROR * while we're still holding the loglock. */ if (logerror) retval = xlog_state_ioerror(log); spin_unlock(&log->l_icloglock); /* * We don't want anybody waiting for log reservations * after this. That means we have to wake up everybody * queued up on reserve_headq as well as write_headq. * In addition, we make sure in xlog_{re}grant_log_space * that we don't enqueue anything once the SHUTDOWN flag * is set, and this action is protected by the GRANTLOCK. */ if ((tic = log->l_reserve_headq)) { do { sv_signal(&tic->t_wait); tic = tic->t_next; } while (tic != log->l_reserve_headq); } if ((tic = log->l_write_headq)) { do { sv_signal(&tic->t_wait); tic = tic->t_next; } while (tic != log->l_write_headq); } spin_unlock(&log->l_grant_lock); if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { ASSERT(!logerror); /* * Force the incore logs to disk before shutting the * log down completely. */ xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy); spin_lock(&log->l_icloglock); retval = xlog_state_ioerror(log); spin_unlock(&log->l_icloglock); } /* * Wake up everybody waiting on xfs_log_force. * Callback all log item committed functions as if the * log writes were completed. */ xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); #ifdef XFSERRORDEBUG { xlog_in_core_t *iclog; spin_lock(&log->l_icloglock); iclog = log->l_iclog; do { ASSERT(iclog->ic_callback == 0); iclog = iclog->ic_next; } while (iclog != log->l_iclog); spin_unlock(&log->l_icloglock); } #endif /* return non-zero if log IOERROR transition had already happened */ return retval; } STATIC int xlog_iclogs_empty(xlog_t *log) { xlog_in_core_t *iclog; iclog = log->l_iclog; do { /* endianness does not matter here, zero is zero in * any language. */ if (iclog->ic_header.h_num_logops) return 0; iclog = iclog->ic_next; } while (iclog != log->l_iclog); return 1; }