litmus-rt.git - The LITMUS^RT kernel.

diff options

Diffstat (limited to 'drivers/isdn/hisax/asuscom.c')

-rw-r--r--

427

1 files changed, 427 insertions, 0 deletions


diff --git a/drivers/isdn/hisax/asuscom.c b/drivers/isdn/hisax/asuscom.c new file mode 100644 index 000000000000..7546e2e4a94e --- /dev/null +++ b/drivers/isdn/hisax/asuscom.c
@@ -0,0 +1,427 @@
	1	/* $Id: asuscom.c,v 1.14.2.4 2004/01/13 23:48:39 keil Exp $
	2	*
	3	* low level stuff for ASUSCOM NETWORK INC. ISDNLink cards
	4	*
	5	* Author Karsten Keil
	6	* Copyright by Karsten Keil <keil@isdn4linux.de>
	7	*
	8	* This software may be used and distributed according to the terms
	9	* of the GNU General Public License, incorporated herein by reference.
	10	*
	11	* Thanks to ASUSCOM NETWORK INC. Taiwan and Dynalink NL for information
	12	*
	13	*/
	14
	15	#include <linux/init.h>
	16	#include <linux/isapnp.h>
	17	#include "hisax.h"
	18	#include "isac.h"
	19	#include "ipac.h"
	20	#include "hscx.h"
	21	#include "isdnl1.h"
	22
	23	extern const char *CardType[];
	24
	25	const char *Asuscom_revision = "$Revision: 1.14.2.4 $";
	26
	/* Port I/O wrappers taking the port address first, the value second. */
	#define byteout(addr, val)	outb(val, addr)
	#define bytein(addr)		inb(addr)

	/* Port offsets from the card's base port on the ISAC/HSCX variant. */
	#define ASUS_ISAC		0
	#define ASUS_HSCX		1
	#define ASUS_ADR		2
	#define ASUS_CTRL_U7		3
	#define ASUS_CTRL_POTS		5

	/* Port offsets from the card's base port on the IPAC variant. */
	#define ASUS_IPAC_ALE		0
	#define ASUS_IPAC_DATA		1

	/* Values stored in cs->subtyp to tell the two variants apart. */
	#define ASUS_ISACHSCX		1
	#define ASUS_IPAC		2

	/* CARD_ADR (Write) */
	#define ASUS_RESET		0x80	/* Bit 7: reset line */
	44
	45	static inline u_char
	46	readreg(unsigned int ale, unsigned int adr, u_char off)
	47	{
	48	register u_char ret;
	49
	50	byteout(ale, off);
	51	ret = bytein(adr);
	52	return (ret);
	53	}
	54
	55	static inline void
	56	readfifo(unsigned int ale, unsigned int adr, u_char off, u_char * data, int size)
	57	{
	58	byteout(ale, off);
	59	insb(adr, data, size);
	60	}
	61
	62
	63	static inline void
	64	writereg(unsigned int ale, unsigned int adr, u_char off, u_char data)
	65	{
	66	byteout(ale, off);
	67	byteout(adr, data);
	68	}
	69
	70	static inline void
	71	writefifo(unsigned int ale, unsigned int adr, u_char off, u_char * data, int size)
	72	{
	73	byteout(ale, off);
	74	outsb(adr, data, size);
	75	}
	76
	77	/* Interface functions */
	78
	79	static u_char
	80	ReadISAC(struct IsdnCardState *cs, u_char offset)
	81	{
	82	return (readreg(cs->hw.asus.adr, cs->hw.asus.isac, offset));
	83	}
	84
	85	static void
	86	WriteISAC(struct IsdnCardState *cs, u_char offset, u_char value)
	87	{
	88	writereg(cs->hw.asus.adr, cs->hw.asus.isac, offset, value);
	89	}
	90
	91	static void
	92	ReadISACfifo(struct IsdnCardState cs, u_char data, int size)
	93	{
	94	readfifo(cs->hw.asus.adr, cs->hw.asus.isac, 0, data, size);
	95	}
	96
	97	static void
	98	WriteISACfifo(struct IsdnCardState cs, u_char data, int size)
	99	{
	100	writefifo(cs->hw.asus.adr, cs->hw.asus.isac, 0, data, size);
	101	}
	102
	103	static u_char
	104	ReadISAC_IPAC(struct IsdnCardState *cs, u_char offset)
	105	{
	106	return (readreg(cs->hw.asus.adr, cs->hw.asus.isac, offset\|0x80));
	107	}
	108
	109	static void
	110	WriteISAC_IPAC(struct IsdnCardState *cs, u_char offset, u_char value)
	111	{
	112	writereg(cs->hw.asus.adr, cs->hw.asus.isac, offset\|0x80, value);
	113	}
	114
	115	static void
	116	ReadISACfifo_IPAC(struct IsdnCardState cs, u_char data, int size)
	117	{
	118	readfifo(cs->hw.asus.adr, cs->hw.asus.isac, 0x80, data, size);
	119	}
	120
	121	static void
	122	WriteISACfifo_IPAC(struct IsdnCardState cs, u_char data, int size)
	123	{
	124	writefifo(cs->hw.asus.adr, cs->hw.asus.isac, 0x80, data, size);
	125	}
	126
	127	static u_char
	128	ReadHSCX(struct IsdnCardState *cs, int hscx, u_char offset)
	129	{
	130	return (readreg(cs->hw.asus.adr,
	131	cs->hw.asus.hscx, offset + (hscx ? 0x40 : 0)));
	132	}
	133
	134	static void
	135	WriteHSCX(struct IsdnCardState *cs, int hscx, u_char offset, u_char value)
	136	{
	137	writereg(cs->hw.asus.adr,
	138	cs->hw.asus.hscx, offset + (hscx ? 0x40 : 0), value);
	139	}
	140
	141	/*
	142	* fast interrupt HSCX stuff goes here
	143	*/
	144
	145	#define READHSCX(cs, nr, reg) readreg(cs->hw.asus.adr, \
	146	cs->hw.asus.hscx, reg + (nr ? 0x40 : 0))
	147	#define WRITEHSCX(cs, nr, reg, data) writereg(cs->hw.asus.adr, \
	148	cs->hw.asus.hscx, reg + (nr ? 0x40 : 0), data)
	149
	150	#define READHSCXFIFO(cs, nr, ptr, cnt) readfifo(cs->hw.asus.adr, \
	151	cs->hw.asus.hscx, (nr ? 0x40 : 0), ptr, cnt)
	152
	153	#define WRITEHSCXFIFO(cs, nr, ptr, cnt) writefifo(cs->hw.asus.adr, \
	154	cs->hw.asus.hscx, (nr ? 0x40 : 0), ptr, cnt)
	155
	156	#include "hscx_irq.c"
	157
	158	static irqreturn_t
	159	asuscom_interrupt(int intno, void dev_id, struct pt_regs regs)
	160	{
	161	struct IsdnCardState *cs = dev_id;
	162	u_char val;
	163	u_long flags;
	164
	165	spin_lock_irqsave(&cs->lock, flags);
	166	val = readreg(cs->hw.asus.adr, cs->hw.asus.hscx, HSCX_ISTA + 0x40);
	167	Start_HSCX:
	168	if (val)
	169	hscx_int_main(cs, val);
	170	val = readreg(cs->hw.asus.adr, cs->hw.asus.isac, ISAC_ISTA);
	171	Start_ISAC:
	172	if (val)
	173	isac_interrupt(cs, val);
	174	val = readreg(cs->hw.asus.adr, cs->hw.asus.hscx, HSCX_ISTA + 0x40);
	175	if (val) {
	176	if (cs->debug & L1_DEB_HSCX)
	177	debugl1(cs, "HSCX IntStat after IntRoutine");
	178	goto Start_HSCX;
	179	}
	180	val = readreg(cs->hw.asus.adr, cs->hw.asus.isac, ISAC_ISTA);
	181	if (val) {
	182	if (cs->debug & L1_DEB_ISAC)
	183	debugl1(cs, "ISAC IntStat after IntRoutine");
	184	goto Start_ISAC;
	185	}
	186	writereg(cs->hw.asus.adr, cs->hw.asus.hscx, HSCX_MASK, 0xFF);
	187	writereg(cs->hw.asus.adr, cs->hw.asus.hscx, HSCX_MASK + 0x40, 0xFF);
	188	writereg(cs->hw.asus.adr, cs->hw.asus.isac, ISAC_MASK, 0xFF);
	189	writereg(cs->hw.asus.adr, cs->hw.asus.isac, ISAC_MASK, 0x0);
	190	writereg(cs->hw.asus.adr, cs->hw.asus.hscx, HSCX_MASK, 0x0);
	191	writereg(cs->hw.asus.adr, cs->hw.asus.hscx, HSCX_MASK + 0x40, 0x0);
	192	spin_unlock_irqrestore(&cs->lock, flags);
	193	return IRQ_HANDLED;
	194	}
	195
	196	static irqreturn_t
	197	asuscom_interrupt_ipac(int intno, void dev_id, struct pt_regs regs)
	198	{
	199	struct IsdnCardState *cs = dev_id;
	200	u_char ista, val, icnt = 5;
	201	u_long flags;
	202
	203	spin_lock_irqsave(&cs->lock, flags);
	204	ista = readreg(cs->hw.asus.adr, cs->hw.asus.isac, IPAC_ISTA);
	205	Start_IPAC:
	206	if (cs->debug & L1_DEB_IPAC)
	207	debugl1(cs, "IPAC ISTA %02X", ista);
	208	if (ista & 0x0f) {
	209	val = readreg(cs->hw.asus.adr, cs->hw.asus.hscx, HSCX_ISTA + 0x40);
	210	if (ista & 0x01)
	211	val \|= 0x01;
	212	if (ista & 0x04)
	213	val \|= 0x02;
	214	if (ista & 0x08)
	215	val \|= 0x04;
	216	if (val)
	217	hscx_int_main(cs, val);
	218	}
	219	if (ista & 0x20) {
	220	val = 0xfe & readreg(cs->hw.asus.adr, cs->hw.asus.isac, ISAC_ISTA \| 0x80);
	221	if (val) {
	222	isac_interrupt(cs, val);
	223	}
	224	}
	225	if (ista & 0x10) {
	226	val = 0x01;
	227	isac_interrupt(cs, val);
	228	}
	229	ista = readreg(cs->hw.asus.adr, cs->hw.asus.isac, IPAC_ISTA);
	230	if ((ista & 0x3f) && icnt) {
	231	icnt--;
	232	goto Start_IPAC;
	233	}
	234	if (!icnt)
	235	printk(KERN_WARNING "ASUS IRQ LOOP\n");
	236	writereg(cs->hw.asus.adr, cs->hw.asus.isac, IPAC_MASK, 0xFF);
	237	writereg(cs->hw.asus.adr, cs->hw.asus.isac, IPAC_MASK, 0xC0);
	238	spin_unlock_irqrestore(&cs->lock, flags);
	239	return IRQ_HANDLED;
	240	}
	241
	242	void
	243	release_io_asuscom(struct IsdnCardState *cs)
	244	{
	245	int bytecnt = 8;
	246
	247	if (cs->hw.asus.cfg_reg)
	248	release_region(cs->hw.asus.cfg_reg, bytecnt);
	249	}
	250
	251	static void
	252	reset_asuscom(struct IsdnCardState *cs)
	253	{
	254	if (cs->subtyp == ASUS_IPAC)
	255	writereg(cs->hw.asus.adr, cs->hw.asus.isac, IPAC_POTA2, 0x20);
	256	else
	257	byteout(cs->hw.asus.adr, ASUS_RESET); /* Reset On */
	258	mdelay(10);
	259	if (cs->subtyp == ASUS_IPAC)
	260	writereg(cs->hw.asus.adr, cs->hw.asus.isac, IPAC_POTA2, 0x0);
	261	else
	262	byteout(cs->hw.asus.adr, 0); /* Reset Off */
	263	mdelay(10);
	264	if (cs->subtyp == ASUS_IPAC) {
	265	writereg(cs->hw.asus.adr, cs->hw.asus.isac, IPAC_CONF, 0x0);
	266	writereg(cs->hw.asus.adr, cs->hw.asus.isac, IPAC_ACFG, 0xff);
	267	writereg(cs->hw.asus.adr, cs->hw.asus.isac, IPAC_AOE, 0x0);
	268	writereg(cs->hw.asus.adr, cs->hw.asus.isac, IPAC_MASK, 0xc0);
	269	writereg(cs->hw.asus.adr, cs->hw.asus.isac, IPAC_PCFG, 0x12);
	270	}
	271	}
	272
	273	static int
	274	Asus_card_msg(struct IsdnCardState cs, int mt, void arg)
	275	{
	276	u_long flags;
	277
	278	switch (mt) {
	279	case CARD_RESET:
	280	spin_lock_irqsave(&cs->lock, flags);
	281	reset_asuscom(cs);
	282	spin_unlock_irqrestore(&cs->lock, flags);
	283	return(0);
	284	case CARD_RELEASE:
	285	release_io_asuscom(cs);
	286	return(0);
	287	case CARD_INIT:
	288	spin_lock_irqsave(&cs->lock, flags);
	289	cs->debug \|= L1_DEB_IPAC;
	290	inithscxisac(cs, 3);
	291	spin_unlock_irqrestore(&cs->lock, flags);
	292	return(0);
	293	case CARD_TEST:
	294	return(0);
	295	}
	296	return(0);
	297	}
	298
	#ifdef __ISAPNP__
	/*
	 * ISAPnP IDs probed by setup_asuscom().  Each entry repeats the same
	 * vendor/function pair for the card and for the device; the last field
	 * carries the name printed when the card is detected.  The table ends
	 * with an all-zero entry.
	 */
	static struct isapnp_device_id asus_ids[] __initdata = {
		{
			ISAPNP_VENDOR('A', 'S', 'U'), ISAPNP_FUNCTION(0x1688),
			ISAPNP_VENDOR('A', 'S', 'U'), ISAPNP_FUNCTION(0x1688),
			(unsigned long) "Asus1688 PnP"
		},
		{
			ISAPNP_VENDOR('A', 'S', 'U'), ISAPNP_FUNCTION(0x1690),
			ISAPNP_VENDOR('A', 'S', 'U'), ISAPNP_FUNCTION(0x1690),
			(unsigned long) "Asus1690 PnP"
		},
		{
			ISAPNP_VENDOR('S', 'I', 'E'), ISAPNP_FUNCTION(0x0020),
			ISAPNP_VENDOR('S', 'I', 'E'), ISAPNP_FUNCTION(0x0020),
			(unsigned long) "Isurf2 PnP"
		},
		{
			ISAPNP_VENDOR('E', 'L', 'F'), ISAPNP_FUNCTION(0x0000),
			ISAPNP_VENDOR('E', 'L', 'F'), ISAPNP_FUNCTION(0x0000),
			(unsigned long) "Iscas TE320"
		},
		{ 0, }
	};

	/* Cursor into asus_ids and the PnP card found last during probing. */
	static struct isapnp_device_id *ipid __initdata = &asus_ids[0];
	static struct pnp_card *pnp_c __devinitdata = NULL;
	#endif
	319
	320	int __init
	321	setup_asuscom(struct IsdnCard *card)
	322	{
	323	int bytecnt;
	324	struct IsdnCardState *cs = card->cs;
	325	u_char val;
	326	char tmp[64];
	327
	328	strcpy(tmp, Asuscom_revision);
	329	printk(KERN_INFO "HiSax: Asuscom ISDNLink driver Rev. %s\n", HiSax_getrev(tmp));
	330	if (cs->typ != ISDN_CTYPE_ASUSCOM)
	331	return (0);
	332	#ifdef __ISAPNP__
	333	if (!card->para[1] && isapnp_present()) {
	334	struct pnp_dev *pnp_d;
	335	while(ipid->card_vendor) {
	336	if ((pnp_c = pnp_find_card(ipid->card_vendor,
	337	ipid->card_device, pnp_c))) {
	338	pnp_d = NULL;
	339	if ((pnp_d = pnp_find_dev(pnp_c,
	340	ipid->vendor, ipid->function, pnp_d))) {
	341	int err;
	342
	343	printk(KERN_INFO "HiSax: %s detected\n",
	344	(char *)ipid->driver_data);
	345	pnp_disable_dev(pnp_d);
	346	err = pnp_activate_dev(pnp_d);
	347	if (err<0) {
	348	printk(KERN_WARNING "%s: pnp_activate_dev ret(%d)\n",
	349	__FUNCTION__, err);
	350	return(0);
	351	}
	352	card->para[1] = pnp_port_start(pnp_d, 0);
	353	card->para[0] = pnp_irq(pnp_d, 0);
	354	if (!card->para[0] \|\| !card->para[1]) {
	355	printk(KERN_ERR "AsusPnP:some resources are missing %ld/%lx\n",
	356	card->para[0], card->para[1]);
	357	pnp_disable_dev(pnp_d);
	358	return(0);
	359	}
	360	break;
	361	} else {
	362	printk(KERN_ERR "AsusPnP: PnP error card found, no device\n");
	363	}
	364	}
	365	ipid++;
	366	pnp_c = NULL;
	367	}
	368	if (!ipid->card_vendor) {
	369	printk(KERN_INFO "AsusPnP: no ISAPnP card found\n");
	370	return(0);
	371	}
	372	}
	373	#endif
	374	bytecnt = 8;
	375	cs->hw.asus.cfg_reg = card->para[1];
	376	cs->irq = card->para[0];
	377	if (!request_region(cs->hw.asus.cfg_reg, bytecnt, "asuscom isdn")) {
	378	printk(KERN_WARNING
	379	"HiSax: %s config port %x-%x already in use\n",
	380	CardType[card->typ],
	381	cs->hw.asus.cfg_reg,
	382	cs->hw.asus.cfg_reg + bytecnt);
	383	return (0);
	384	}
	385	printk(KERN_INFO "ISDNLink: defined at 0x%x IRQ %d\n",
	386	cs->hw.asus.cfg_reg, cs->irq);
	387	setup_isac(cs);
	388	cs->BC_Read_Reg = &ReadHSCX;
	389	cs->BC_Write_Reg = &WriteHSCX;
	390	cs->BC_Send_Data = &hscx_fill_fifo;
	391	cs->cardmsg = &Asus_card_msg;
	392	val = readreg(cs->hw.asus.cfg_reg + ASUS_IPAC_ALE,
	393	cs->hw.asus.cfg_reg + ASUS_IPAC_DATA, IPAC_ID);
	394	if ((val == 1) \|\| (val == 2)) {
	395	cs->subtyp = ASUS_IPAC;
	396	cs->hw.asus.adr = cs->hw.asus.cfg_reg + ASUS_IPAC_ALE;
	397	cs->hw.asus.isac = cs->hw.asus.cfg_reg + ASUS_IPAC_DATA;
	398	cs->hw.asus.hscx = cs->hw.asus.cfg_reg + ASUS_IPAC_DATA;
	399	test_and_set_bit(HW_IPAC, &cs->HW_Flags);
	400	cs->readisac = &ReadISAC_IPAC;
	401	cs->writeisac = &WriteISAC_IPAC;
	402	cs->readisacfifo = &ReadISACfifo_IPAC;
	403	cs->writeisacfifo = &WriteISACfifo_IPAC;
	404	cs->irq_func = &asuscom_interrupt_ipac;
	405	printk(KERN_INFO "Asus: IPAC version %x\n", val);
	406	} else {
	407	cs->subtyp = ASUS_ISACHSCX;
	408	cs->hw.asus.adr = cs->hw.asus.cfg_reg + ASUS_ADR;
	409	cs->hw.asus.isac = cs->hw.asus.cfg_reg + ASUS_ISAC;
	410	cs->hw.asus.hscx = cs->hw.asus.cfg_reg + ASUS_HSCX;
	411	cs->hw.asus.u7 = cs->hw.asus.cfg_reg + ASUS_CTRL_U7;
	412	cs->hw.asus.pots = cs->hw.asus.cfg_reg + ASUS_CTRL_POTS;
	413	cs->readisac = &ReadISAC;
	414	cs->writeisac = &WriteISAC;
	415	cs->readisacfifo = &ReadISACfifo;
	416	cs->writeisacfifo = &WriteISACfifo;
	417	cs->irq_func = &asuscom_interrupt;
	418	ISACVersion(cs, "ISDNLink:");
	419	if (HscxVersion(cs, "ISDNLink:")) {
	420	printk(KERN_WARNING
	421	"ISDNLink: wrong HSCX versions check IO address\n");
	422	release_io_asuscom(cs);
	423	return (0);
	424	}
	425	}
	426	return (1);
	427	}

/* * Copyright (C) 2008 Oracle. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License v2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program; if not, write to the * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/list_sort.h> #include "ctree.h" #include "transaction.h" #include "disk-io.h" #include "locking.h" #include "print-tree.h" #include "backref.h" #include "tree-log.h" #include "hash.h" /* magic values for the inode_only field in btrfs_log_inode: * * LOG_INODE_ALL means to log everything * LOG_INODE_EXISTS means to log just enough to recreate the inode * during log replay */ #define LOG_INODE_ALL 0 #define LOG_INODE_EXISTS 1 /* * directory trouble cases * * 1) on rename or unlink, if the inode being unlinked isn't in the fsync * log, we must force a full commit before doing an fsync of the directory * where the unlink was done. * ---> record transid of last unlink/rename per directory * * mkdir foo/some_dir * normal commit * rename foo/some_dir foo2/some_dir * mkdir foo/some_dir * fsync foo/some_dir/some_file * * The fsync above will unlink the original some_dir without recording * it in its new location (foo2). After a crash, some_dir will be gone * unless the fsync of some_file forces a full commit * * 2) we must log any new names for any file or dir that is in the fsync * log. ---> check inode while renaming/linking. 
* * 2a) we must log any new names for any file or dir during rename * when the directory they are being removed from was logged. * ---> check inode and old parent dir during rename * * 2a is actually the more important variant. With the extra logging * a crash might unlink the old name without recreating the new one * * 3) after a crash, we must go through any directories with a link count * of zero and redo the rm -rf * * mkdir f1/foo * normal commit * rm -rf f1/foo * fsync(f1) * * The directory f1 was fully removed from the FS, but fsync was never * called on f1, only its parent dir. After a crash the rm -rf must * be replayed. This must be able to recurse down the entire * directory tree. The inode link count fixup code takes care of the * ugly details. */ /* * stages for the tree walking. The first * stage (0) is to only pin down the blocks we find * the second stage (1) is to make sure that all the inodes * we find in the log are created in the subvolume. * * The last stage is to deal with directories and links and extents * and all the other fun semantics */ #define LOG_WALK_PIN_ONLY 0 #define LOG_WALK_REPLAY_INODES 1 #define LOG_WALK_REPLAY_DIR_INDEX 2 #define LOG_WALK_REPLAY_ALL 3 static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, u64 dirid, int del_all); /* * tree logging is a special write ahead log used to make sure that * fsyncs and O_SYNCs can happen without doing full tree commits. * * Full tree commits are expensive because they require commonly * modified blocks to be recowed, creating many dirty pages in the * extent tree an 4x-6x higher write load than ext3. 
*
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 *
 * Returns 0 on success, or the error from btrfs_init_log_root_tree() or
 * btrfs_add_log_tree().  On success log_batch and log_writers have both
 * been incremented.
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root)
{
	int ret;

	mutex_lock(&root->log_mutex);
	/* fast path: a log tree already exists for this root, just join it */
	if (root->log_root) {
		/* track whether more than one pid is writing to this log */
		if (!root->log_start_pid) {
			root->log_start_pid = current->pid;
			root->log_multiple_pids = false;
		} else if (root->log_start_pid != current->pid) {
			root->log_multiple_pids = true;
		}

		atomic_inc(&root->log_batch);
		atomic_inc(&root->log_writers);
		mutex_unlock(&root->log_mutex);
		return 0;
	}
	ret = 0;
	/* create the fs-wide log root tree first if it does not exist yet */
	mutex_lock(&root->fs_info->tree_log_mutex);
	if (!root->fs_info->log_root_tree)
		ret = btrfs_init_log_root_tree(trans, root->fs_info);
	mutex_unlock(&root->fs_info->tree_log_mutex);
	if (ret)
		goto out;

	if (!root->log_root) {
		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;
	}
	root->log_multiple_pids = false;
	root->log_start_pid = current->pid;
	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were not transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	/* unlocked check first, then re-check under log_mutex */
	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 *
 * NOTE(review): 'ret' is initialised to -ENOENT and never reassigned, so
 * this function returns -ENOENT even though it always increments
 * log_writers - confirm that callers ignore the return value.
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	/* only the writer that drops the count to zero does the wake up */
	if (atomic_dec_and_test(&root->log_writers)) {
		smp_mb();
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}


/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 *
 * Which of those actions run is selected by the pin, write and wait
 * fields of 'wc'.  Returns 0 or the first error from reading, pinning
 * or excluding extents.
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
						      eb->start, eb->len);

	/* the remaining steps only run on a buffer that is up to date */
	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(log, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
*/
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	/* overwrite_root is set when the destination is not a log tree */
	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		/* sizes differ, so the contents cannot match; go rewrite */
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(root, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(root, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0)
			goto no_copy;

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	/* put back the directory i_size saved before the copy */
	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 *
 * Returns NULL when btrfs_iget() fails or hands back a bad inode.
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.
So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 *
 * Returns 0 on success (including for extent types that are skipped),
 * -EIO if the inode cannot be read, or the error of the failing step.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	/* work out where the extent ends and how many bytes it accounts for */
	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, slot, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size, root->sectorsize);
	} else {
		/* any other extent type is ignored */
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
				       start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				(unsigned long)item,  sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/* a disk bytenr of zero needs no extent reference or csums */
		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_extent(root, ins.objectid,
						ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset, 0);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root, root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			/*
			 * compressed extents look up csums for the whole disk
			 * extent, others only for the range this item refers to
			 */
			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * every list entry is freed even after an error; only
			 * the inserts into the csum root stop once ret is set
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						root->fs_info->csum_root,
						sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
* * This is a helper function to do the unlink of a specific directory * item */ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct inode *dir, struct btrfs_dir_item *di) { struct inode *inode; char *name; int name_len; struct extent_buffer *leaf; struct btrfs_key location; int ret; leaf = path->nodes[0]; btrfs_dir_item_key_to_cpu(leaf, di, &location); name_len = btrfs_dir_name_len(leaf, di); name = kmalloc(name_len, GFP_NOFS); if (!name) return -ENOMEM; read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); btrfs_release_path(path); inode = read_one_inode(root, location.objectid); if (!inode) { ret = -EIO; goto out; } ret = link_to_fixup_dir(trans, root, path, location.objectid); if (ret) goto out; ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); if (ret) goto out; else ret = btrfs_run_delayed_items(trans, root); out: kfree(name); iput(inode); return ret; } /* * helper function to see if a given name and sequence number found * in an inode back reference are already in a directory and correctly * point to this inode */ static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, u64 objectid, u64 index, const char *name, int name_len) { struct btrfs_dir_item *di; struct btrfs_key location; int match = 0; di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, index, name, name_len, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); if (location.objectid != objectid) goto out; } else goto out; btrfs_release_path(path); di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); if (location.objectid != objectid) goto out; } else goto out; match = 1; out: btrfs_release_path(path); return match; } /* * helper function to check a log tree for a named back reference in * an inode. 
This is used to decide if a back reference that is * found in the subvolume conflicts with what we find in the log. * * inode backreferences may have multiple refs in a single item, * during replay we process one reference at a time, and we don't * want to delete valid links to a file from the subvolume if that * link is also in the log. */ static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, u64 ref_objectid, char *name, int namelen) { struct btrfs_path *path; struct btrfs_inode_ref *ref; unsigned long ptr; unsigned long ptr_end; unsigned long name_ptr; int found_name_len; int item_size; int ret; int match = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(NULL, log, key, path, 0, 0); if (ret != 0) goto out; ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); if (key->type == BTRFS_INODE_EXTREF_KEY) { if (btrfs_find_name_in_ext_backref(path, ref_objectid, name, namelen, NULL)) match = 1; goto out; } item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ptr_end = ptr + item_size; while (ptr < ptr_end) { ref = (struct btrfs_inode_ref *)ptr; found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); if (found_name_len == namelen) { name_ptr = (unsigned long)(ref + 1); ret = memcmp_extent_buffer(path->nodes[0], name, name_ptr, namelen); if (ret == 0) { match = 1; goto out; } } ptr = (unsigned long)(ref + 1) + found_name_len; } out: btrfs_free_path(path); return match; } static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_root *log_root, struct inode *dir, struct inode *inode, struct extent_buffer *eb, u64 inode_objectid, u64 parent_objectid, u64 ref_index, char *name, int namelen, int *search_done) { int ret; char *victim_name; int victim_name_len; struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key search_key; struct btrfs_inode_extref *extref; again: /* Search old style refs */ 
search_key.objectid = inode_objectid; search_key.type = BTRFS_INODE_REF_KEY; search_key.offset = parent_objectid; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret == 0) { struct btrfs_inode_ref *victim_ref; unsigned long ptr; unsigned long ptr_end; leaf = path->nodes[0]; /* are we trying to overwrite a back ref for the root directory * if so, just jump out, we're done */ if (search_key.objectid == search_key.offset) return 1; /* check all the names in this back reference to see * if they are in the log. if so, we allow them to stay * otherwise they must be unlinked as a conflict */ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); while (ptr < ptr_end) { victim_ref = (struct btrfs_inode_ref *)ptr; victim_name_len = btrfs_inode_ref_name_len(leaf, victim_ref); victim_name = kmalloc(victim_name_len, GFP_NOFS); if (!victim_name) return -ENOMEM; read_extent_buffer(leaf, victim_name, (unsigned long)(victim_ref + 1), victim_name_len); if (!backref_in_log(log_root, &search_key, parent_objectid, victim_name, victim_name_len)) { inc_nlink(inode); btrfs_release_path(path); ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); kfree(victim_name); if (ret) return ret; ret = btrfs_run_delayed_items(trans, root); if (ret) return ret; *search_done = 1; goto again; } kfree(victim_name); ptr = (unsigned long)(victim_ref + 1) + victim_name_len; } /* * NOTE: we have searched root tree and checked the * coresponding ref, it does not need to check again. 
*/ *search_done = 1; } btrfs_release_path(path); /* Same search but for extended refs */ extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, inode_objectid, parent_objectid, 0, 0); if (!IS_ERR_OR_NULL(extref)) { u32 item_size; u32 cur_offset = 0; unsigned long base; struct inode *victim_parent; leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { extref = (struct btrfs_inode_extref *)base + cur_offset; victim_name_len = btrfs_inode_extref_name_len(leaf, extref); if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) goto next; victim_name = kmalloc(victim_name_len, GFP_NOFS); if (!victim_name) return -ENOMEM; read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, victim_name_len); search_key.objectid = inode_objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; search_key.offset = btrfs_extref_hash(parent_objectid, victim_name, victim_name_len); ret = 0; if (!backref_in_log(log_root, &search_key, parent_objectid, victim_name, victim_name_len)) { ret = -ENOENT; victim_parent = read_one_inode(root, parent_objectid); if (victim_parent) { inc_nlink(inode); btrfs_release_path(path); ret = btrfs_unlink_inode(trans, root, victim_parent, inode, victim_name, victim_name_len); if (!ret) ret = btrfs_run_delayed_items( trans, root); } iput(victim_parent); kfree(victim_name); if (ret) return ret; *search_done = 1; goto again; } kfree(victim_name); if (ret) return ret; next: cur_offset += victim_name_len + sizeof(*extref); } *search_done = 1; } btrfs_release_path(path); /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), ref_index, name, namelen, 0); if (di && !IS_ERR(di)) { ret = drop_one_dir_item(trans, root, path, dir, di); if (ret) return ret; } btrfs_release_path(path); /* look for a conflicing name */ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 
namelen, 0); if (di && !IS_ERR(di)) { ret = drop_one_dir_item(trans, root, path, dir, di); if (ret) return ret; } btrfs_release_path(path); return 0; } static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, u32 *namelen, char **name, u64 *index, u64 *parent_objectid) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ref_ptr; *namelen = btrfs_inode_extref_name_len(eb, extref); *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; read_extent_buffer(eb, *name, (unsigned long)&extref->name, *namelen); *index = btrfs_inode_extref_index(eb, extref); if (parent_objectid) *parent_objectid = btrfs_inode_extref_parent(eb, extref); return 0; } static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, u32 *namelen, char **name, u64 *index) { struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ref_ptr; *namelen = btrfs_inode_ref_name_len(eb, ref); *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); *index = btrfs_inode_ref_index(eb, ref); return 0; } /* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. * root is the destination we are replaying into, and path is for temp * use by this function. (it should be released on return). 
*/ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key) { struct inode *dir = NULL; struct inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; char *name = NULL; int namelen; int ret; int search_done = 0; int log_ref_ver = 0; u64 parent_objectid; u64 inode_objectid; u64 ref_index = 0; int ref_struct_size; ref_ptr = btrfs_item_ptr_offset(eb, slot); ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); if (key->type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *r; ref_struct_size = sizeof(struct btrfs_inode_extref); log_ref_ver = 1; r = (struct btrfs_inode_extref *)ref_ptr; parent_objectid = btrfs_inode_extref_parent(eb, r); } else { ref_struct_size = sizeof(struct btrfs_inode_ref); parent_objectid = key->offset; } inode_objectid = key->objectid; /* * it is possible that we didn't log all the parent directories * for a given inode. If we don't find the dir, just don't * copy the back ref in. The link count fixup code will take * care of the rest */ dir = read_one_inode(root, parent_objectid); if (!dir) { ret = -ENOENT; goto out; } inode = read_one_inode(root, inode_objectid); if (!inode) { ret = -EIO; goto out; } while (ref_ptr < ref_end) { if (log_ref_ver) { ret = extref_get_fields(eb, ref_ptr, &namelen, &name, &ref_index, &parent_objectid); /* * parent object can change from one array * item to another. */ if (!dir) dir = read_one_inode(root, parent_objectid); if (!dir) { ret = -ENOENT; goto out; } } else { ret = ref_get_fields(eb, ref_ptr, &namelen, &name, &ref_index); } if (ret) goto out; /* if we already have a perfect match, we're done */ if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), ref_index, name, namelen)) { /* * look for a conflicting back reference in the * metadata. if we find one we have to unlink that name * of the file before we add our new link. 
Later on, we * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ if (!search_done) { ret = __add_inode_ref(trans, root, path, log, dir, inode, eb, inode_objectid, parent_objectid, ref_index, name, namelen, &search_done); if (ret) { if (ret == 1) ret = 0; goto out; } } /* insert our name */ ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, ref_index); if (ret) goto out; btrfs_update_inode(trans, root, inode); } ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; kfree(name); name = NULL; if (log_ref_ver) { iput(dir); dir = NULL; } } /* finally write the back reference in the inode */ ret = overwrite_item(trans, root, path, eb, slot, key); out: btrfs_release_path(path); kfree(name); iput(dir); iput(inode); return ret; } static int insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { int ret; ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID, offset, BTRFS_ORPHAN_ITEM_KEY, NULL); if (ret > 0) ret = btrfs_insert_orphan_item(trans, root, offset); return ret; } static int count_inode_extrefs(struct btrfs_root *root, struct inode *inode, struct btrfs_path *path) { int ret = 0; int name_len; unsigned int nlink = 0; u32 item_size; u32 cur_offset = 0; u64 inode_objectid = btrfs_ino(inode); u64 offset = 0; unsigned long ptr; struct btrfs_inode_extref *extref; struct extent_buffer *leaf; while (1) { ret = btrfs_find_one_extref(root, inode_objectid, offset, path, &extref, &offset); if (ret) break; leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { extref = (struct btrfs_inode_extref *) (ptr + cur_offset); name_len = btrfs_inode_extref_name_len(leaf, extref); nlink++; cur_offset += name_len + sizeof(*extref); } offset++; btrfs_release_path(path); } btrfs_release_path(path); if (ret < 0) return ret; return nlink; } static int count_inode_refs(struct 
btrfs_root *root, struct inode *inode, struct btrfs_path *path) { int ret; struct btrfs_key key; unsigned int nlink = 0; unsigned long ptr; unsigned long ptr_end; int name_len; u64 ino = btrfs_ino(inode); key.objectid = ino; key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) break; if (ret > 0) { if (path->slots[0] == 0) break; path->slots[0]--; } process_slot: btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid != ino || key.type != BTRFS_INODE_REF_KEY) break; ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], path->slots[0]); while (ptr < ptr_end) { struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ptr; name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); ptr = (unsigned long)(ref + 1) + name_len; nlink++; } if (key.offset == 0) break; if (path->slots[0] > 0) { path->slots[0]--; goto process_slot; } key.offset--; btrfs_release_path(path); } btrfs_release_path(path); return nlink; } /* * There are a few corners where the link count of the file can't * be properly maintained during replay. So, instead of adding * lots of complexity to the log code, we just scan the backrefs * for any file that has been through replay. * * The scan will update the link count on the inode to reflect the * number of back refs found. If it goes down to zero, the iput * will free the inode. 
*/
/*
 * Recompute the link count of 'inode' as the number of names found by
 * count_inode_refs() plus count_inode_extrefs() (-ENOENT from the latter
 * counts as zero) and store it in the inode if it differs.
 *
 * When the resulting link count is zero, a directory first has all of its
 * entries removed via replay_dir_deletes() (no log, del_all set) and then
 * an orphan item is inserted for the inode.
 *
 * Returns 0 or a negative errno.  NOTE: the result of btrfs_update_inode()
 * is not checked here.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, inode, path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, inode, path);
	/* no extended refs at all is not an error */
	if (ret == -ENOENT)
		ret = 0;

	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Process every item recorded under BTRFS_TREE_LOG_FIXUP_OBJECTID.
 *
 * Each pass searches for the highest remaining fixup item, deletes it,
 * reads the inode named by the key offset and runs
 * fixup_inode_link_count() on it.
 *
 * Returns 0 when no fixup items are left, -EIO when an inode cannot be
 * read, or another negative errno.
 */
static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			/* no exact match: look at the item just before */
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		/* the path is already released, so returning directly is fine */
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.
The link count is incremented here * so the inode won't go away until we check it */ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid) { struct btrfs_key key; int ret = 0; struct inode *inode; inode = read_one_inode(root, objectid); if (!inode) return -EIO; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); key.offset = objectid; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); btrfs_release_path(path); if (ret == 0) { if (!inode->i_nlink) set_nlink(inode, 1); else inc_nlink(inode); ret = btrfs_update_inode(trans, root, inode); } else if (ret == -EEXIST) { ret = 0; } else { BUG(); /* Logic Error */ } iput(inode); return ret; } /* * when replaying the log for a directory, we only insert names * for inodes that actually exist. This means an fsync on a directory * does not implicitly fsync all the new files in it */ static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dirid, u64 index, char *name, int name_len, u8 type, struct btrfs_key *location) { struct inode *inode; struct inode *dir; int ret; inode = read_one_inode(root, location->objectid); if (!inode) return -ENOENT; dir = read_one_inode(root, dirid); if (!dir) { iput(inode); return -EIO; } ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); /* FIXME, put inode into FIXUP list */ iput(inode); iput(dir); return ret; } /* * take a single entry in a log directory item and replay it into * the subvolume. * * if a conflicting item exists in the subdirectory already, * the inode it points to is unlinked and put into the link count * fix up tree. * * If a name from the log points to a file or directory that does * not exist in the FS, it is skipped. fsyncs on directories * do not force down inodes inside that directory, just changes to the * names or unlinks in a directory. 
*/
/*
 * 'eb'/'di' locate the logged directory entry, 'key' is the key of the
 * log item that contains it (BTRFS_DIR_ITEM_KEY or BTRFS_DIR_INDEX_KEY;
 * any other type is rejected with -EINVAL).
 *
 * For BTRFS_DIR_INDEX_KEY items the directory i_size is increased by
 * name_len * 2 on the way out, unless an error occurred or the name went
 * through insert_one_name().
 *
 * Returns 0 or a negative errno; -ENOENT from insert_one_name() is
 * swallowed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
		   name_len);

	/* does the inode the logged entry points to exist in the subvolume? */
	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	/* look up the entry the subvolume currently has for this name */
	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
				       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	/* common exit: also reached from the insert section below */
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(dir, dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	return ret;

insert:
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
			      name, name_len, log_type, &log_key);
	/* -ENOENT (target inode not present) is not treated as a failure */
	if (ret && ret != -ENOENT)
		goto out;
	/* skip the i_size adjustment at 'out' for names that went through here */
	update_size = false;
	ret = 0;
	goto out;
}

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 *
 * Returns 0, -EIO when verify_dir_item() rejects an entry, or the first
 * non-zero result of replay_one_name().
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di))
			return -EIO;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret)
			return ret;
		/* advance past the dir item header and the name after it */
		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	return 0;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
*/
/*
 * Find the logged range of type 'key_type' for directory 'dirid' that
 * contains *start_ret, or failing that the next range after it.
 *
 * Returns 0 with *start_ret / *end_ret set to the bounds of the range
 * found, 1 when there is no further range (also when *start_ret is
 * already (u64)-1), or a negative errno from the tree search.
 * The path is released on return.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		/* no exact match: the range may start in the previous item */
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/*
	 * check the next slot in the tree to see if it is a valid item.
	 * Advance first and only then test against nritems, so that moving
	 * off the last item of a leaf switches to the next leaf instead of
	 * reading a key from a slot past the end.
	 */
	nritems = btrfs_header_nritems(path->nodes[0]);
	path->slots[0]++;
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * this looks for a given directory item in the log.
If the directory
 * item is not in the log, the item is removed and the inode it points
 * to is unlinked
 */
/*
 * 'path' must point at the subvolume directory item described by
 * 'dir_key'; 'log_path' is scratch space for the lookups in 'log'.
 * When 'log' is NULL no lookup is done and every name is removed.
 *
 * After a name has been unlinked the item is searched again and, if it
 * still exists, processed from the start because both paths were
 * released.
 *
 * Returns 0 or a negative errno.  Both paths are released on the 'out'
 * exit; the two direct returns (inode not readable, lookup error) leave
 * that to the caller.
 */
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_root *log,
				      struct btrfs_path *path,
				      struct btrfs_path *log_path,
				      struct inode *dir,
				      struct btrfs_key *dir_key)
{
	int ret;
	struct extent_buffer *eb;
	int slot;
	u32 item_size;
	struct btrfs_dir_item *di;
	struct btrfs_dir_item *log_di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	char *name;
	struct inode *inode;
	struct btrfs_key location;

again:
	eb = path->nodes[0];
	slot = path->slots[0];
	item_size = btrfs_item_size_nr(eb, slot);
	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di)) {
			ret = -EIO;
			goto out;
		}

		name_len = btrfs_dir_name_len(eb, di);
		name = kmalloc(name_len, GFP_NOFS);
		if (!name) {
			ret = -ENOMEM;
			goto out;
		}
		read_extent_buffer(eb, name, (unsigned long)(di + 1),
				  name_len);
		/* stays NULL when there is no log to look in */
		log_di = NULL;
		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
			log_di = btrfs_lookup_dir_item(trans, log, log_path,
						       dir_key->objectid,
						       name, name_len, 0);
		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
			log_di = btrfs_lookup_dir_index_item(trans, log,
						     log_path,
						     dir_key->objectid,
						     dir_key->offset,
						     name, name_len, 0);
		}
		if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
			/* the name is not in the log: unlink it */
			btrfs_dir_item_key_to_cpu(eb, di, &location);
			btrfs_release_path(path);
			btrfs_release_path(log_path);
			inode = read_one_inode(root, location.objectid);
			if (!inode) {
				kfree(name);
				return -EIO;
			}

			ret = link_to_fixup_dir(trans, root,
						path, location.objectid);
			if (ret) {
				kfree(name);
				iput(inode);
				goto out;
			}

			inc_nlink(inode);
			ret = btrfs_unlink_inode(trans, root, dir, inode,
						 name, name_len);
			if (!ret)
				ret = btrfs_run_delayed_items(trans, root);
			kfree(name);
			iput(inode);
			if (ret)
				goto out;

			/* there might still be more names under this key
			 * check and repeat if required
			 */
			ret = btrfs_search_slot(NULL, root, dir_key, path,
						0, 0);
			if (ret == 0)
				goto again;
			ret = 0;
			goto out;
		} else if (IS_ERR(log_di)) {
			/* any lookup error other than -ENOENT is fatal */
			kfree(name);
			return PTR_ERR(log_di);
		}
		btrfs_release_path(log_path);
		kfree(name);

		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	btrfs_release_path(log_path);
	return ret;
}

/*
 * deletion replay happens before we copy any new directory items
 * out of the log or out of backreferences from inodes.  It
 * scans the log to find ranges of keys that log is authoritative for,
 * and then scans the directory to find items in those ranges that are
 * not present in the log.
 *
 * Anything we don't find in the log is unlinked and removed from the
 * directory.
 *
 * The scan runs twice: first for BTRFS_DIR_ITEM_KEY items (ranges of type
 * BTRFS_DIR_LOG_ITEM_KEY), then for BTRFS_DIR_INDEX_KEY items (ranges of
 * type BTRFS_DIR_LOG_INDEX_KEY).  With 'del_all' set no ranges are looked
 * up and the whole key space of the directory is scanned.
 *
 * Returns 0 (also when the directory inode does not exist), -ENOMEM, or
 * a negative errno from the search / check_item_in_log().
 */
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all)
{
	u64 range_start;
	u64 range_end;
	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
	int ret = 0;
	struct btrfs_key dir_key;
	struct btrfs_key found_key;
	struct btrfs_path *log_path;
	struct inode *dir;

	dir_key.objectid = dirid;
	dir_key.type = BTRFS_DIR_ITEM_KEY;
	log_path = btrfs_alloc_path();
	if (!log_path)
		return -ENOMEM;

	dir = read_one_inode(root, dirid);
	/* it isn't an error if the inode isn't there, that can happen
	 * because we replay the deletes before we copy in the inode item
	 * from the log
	 */
	if (!dir) {
		btrfs_free_path(log_path);
		return 0;
	}
again:
	range_start = 0;
	range_end = 0;
	while (1) {
		if (del_all)
			range_end = (u64)-1;
		else {
			ret = find_dir_range(log, path, dirid, key_type,
					     &range_start, &range_end);
			/* no further range (or an error): done with this type */
			if (ret != 0)
				break;
		}

		dir_key.offset = range_start;
		while (1) {
			int nritems;
			ret = btrfs_search_slot(NULL, root, &dir_key, path,
						0, 0);
			if (ret < 0)
				goto out;

			nritems = btrfs_header_nritems(path->nodes[0]);
			if (path->slots[0] >= nritems) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
					      path->slots[0]);
			/* ran past this directory's items of the current type */
			if (found_key.objectid != dirid ||
			    found_key.type != dir_key.type)
				goto next_type;

			if (found_key.offset > range_end)
				break;

			ret = check_item_in_log(trans, root, log, path,
						log_path, dir,
						&found_key);
			if (ret)
				goto out;
			/* avoid wrapping around when stepping to the next key */
			if (found_key.offset == (u64)-1)
				break;
			dir_key.offset = found_key.offset + 1;
		}
		btrfs_release_path(path);
		if (range_end == (u64)-1)
			break;
		range_start = range_end + 1;
	}

next_type:
	ret = 0;
	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
		/* second pass: directory index items */
		key_type = BTRFS_DIR_LOG_INDEX_KEY;
		dir_key.type = BTRFS_DIR_INDEX_KEY;
		btrfs_release_path(path);
		goto again;
	}
out:
	btrfs_release_path(path);
	btrfs_free_path(log_path);
	iput(dir);
	return ret;
}

/*
 * the process_func used to replay items from the log tree.  This
 * gets called in two different stages.  The first stage just looks
 * for inodes and makes sure they are all copied into the subvolume.
 *
 * The second stage copies all the other item types from the log into
 * the subvolume.  The two stage approach is slower, but gets rid of
 * lots of complexity around inodes referencing other inodes that exist
 * only in the log (references come from either directory items or inode
 * back refs).
*/
/*
 * The stage to run is taken from wc->stage and the destination root from
 * wc->replay_dest.  Blocks that are not leaves are ignored.
 *
 * Returns 0 or the first error hit; -ENOENT from add_inode_ref() is not
 * treated as an error.
 */
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
			     struct walk_control *wc, u64 gen)
{
	int nritems;
	struct btrfs_path *path;
	struct btrfs_root *root = wc->replay_dest;
	struct btrfs_key key;
	int level;
	int i;
	int ret;

	ret = btrfs_read_buffer(eb, gen);
	if (ret)
		return ret;

	level = btrfs_header_level(eb);

	/* only leaves carry items to replay */
	if (level != 0)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	nritems = btrfs_header_nritems(eb);
	for (i = 0; i < nritems; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		/* inode keys are done during the first stage */
		if (key.type == BTRFS_INODE_ITEM_KEY &&
		    wc->stage == LOG_WALK_REPLAY_INODES) {
			struct btrfs_inode_item *inode_item;
			u32 mode;

			inode_item = btrfs_item_ptr(eb, i,
					    struct btrfs_inode_item);
			mode = btrfs_inode_mode(eb, inode_item);
			if (S_ISDIR(mode)) {
				ret = replay_dir_deletes(wc->trans,
					 root, log, path, key.objectid, 0);
				if (ret)
					break;
			}
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			if (ret)
				break;

			/* for regular files, make sure corresponding
			 * orphan item exist. extents past the new EOF
			 * will be truncated later by orphan cleanup.
			 */
			if (S_ISREG(mode)) {
				ret = insert_orphan_item(wc->trans, root,
							 key.objectid);
				if (ret)
					break;
			}

			ret = link_to_fixup_dir(wc->trans, root,
						path, key.objectid);
			if (ret)
				break;
		}

		/* directory index items have a stage of their own */
		if (key.type == BTRFS_DIR_INDEX_KEY &&
		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			if (ret)
				break;
		}

		if (wc->stage < LOG_WALK_REPLAY_ALL)
			continue;

		/* these keys are simply copied */
		if (key.type == BTRFS_XATTR_ITEM_KEY) {
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			if (ret)
				break;
		} else if (key.type == BTRFS_INODE_REF_KEY ||
			   key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = add_inode_ref(wc->trans, root, log, path,
					    eb, i, &key);
			/* -ENOENT from add_inode_ref() is deliberately ignored */
			if (ret && ret != -ENOENT)
				break;
			ret = 0;
		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
			ret = replay_one_extent(wc->trans, root, path,
						eb, i, &key);
			if (ret)
				break;
		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			if (ret)
				break;
		}
	}
	btrfs_free_path(path);
	return ret;
}

/*
 * Descend from path->nodes[*level] towards the leaves.
 *
 * Children of a level 1 node are handed to wc->process_func one by one
 * and, when wc->free is set, read, cleaned (only if 'trans' is given)
 * and released through btrfs_free_and_pin_reserved_extent().  Children
 * of higher nodes are read, stored in the path one level down and
 * descended into.
 *
 * On a normal exit the slot of the current level is set to the number of
 * items of that node.  Returns 0 or a negative errno.
 */
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path, int *level,
				   struct walk_control *wc)
{
	u64 root_owner;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	struct extent_buffer *parent;
	u32 blocksize;
	int ret = 0;

	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	while (*level > 0) {
		WARN_ON(*level < 0);
		WARN_ON(*level >= BTRFS_MAX_LEVEL);
		cur = path->nodes[*level];

		WARN_ON(btrfs_header_level(cur) != *level);

		/* all children of this node have been visited */
		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;

		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
		blocksize = btrfs_level_size(root, *level - 1);

		parent = path->nodes[*level];
		root_owner = btrfs_header_owner(parent);

		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
		if (!next)
			return -ENOMEM;

		if (*level == 1) {
			/* 'next' is a leaf: process it without descending */
			ret = wc->process_func(root, next, wc, ptr_gen);
			if (ret) {
				free_extent_buffer(next);
				return ret;
			}

			path->slots[*level]++;
			if (wc->free) {
				ret = btrfs_read_buffer(next, ptr_gen);
				if (ret) {
					free_extent_buffer(next);
					return ret;
				}

				if (trans) {
					btrfs_tree_lock(next);
					btrfs_set_lock_blocking(next);
					clean_tree_block(trans, root, next);
					btrfs_wait_tree_block_writeback(next);
					btrfs_tree_unlock(next);
				}

				WARN_ON(root_owner !=
					BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_and_pin_reserved_extent(root,
							 bytenr, blocksize);
				if (ret) {
					free_extent_buffer(next);
					return ret;
				}
			}
			free_extent_buffer(next);
			continue;
		}
		ret = btrfs_read_buffer(next, ptr_gen);
		if (ret) {
			free_extent_buffer(next);
			return ret;
		}

		WARN_ON(*level <= 0);
		/* replace whatever the path held one level down */
		if (path->nodes[*level-1])
			free_extent_buffer(path->nodes[*level-1]);
		path->nodes[*level-1] = next;
		*level = btrfs_header_level(next);
		path->slots[*level] = 0;
		cond_resched();
	}
	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);

	cond_resched();
	return 0;
}

/*
 * Move back up the path after walk_down_log_tree() has finished a node.
 *
 * Returns 0 as soon as a level with an unvisited slot is found (the slot
 * is advanced and *level updated).  Nodes that have no slots left are
 * passed to wc->process_func, released when wc->free is set, and removed
 * from the path.  Returns 1 when the loop runs out of levels, or a
 * negative errno.
 */
static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path, int *level,
				 struct walk_control *wc)
{
	u64 root_owner;
	int i;
	int slot;
	int ret;

	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
		slot = path->slots[i];
		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
			path->slots[i]++;
			*level = i;
			WARN_ON(*level == 0);
			return 0;
		} else {
			struct extent_buffer *parent;
			if (path->nodes[*level] == root->node)
				parent = path->nodes[*level];
			else
				parent = path->nodes[*level + 1];

			root_owner = btrfs_header_owner(parent);
			ret = wc->process_func(root, path->nodes[*level], wc,
				 btrfs_header_generation(path->nodes[*level]));
			if (ret)
				return ret;

			if (wc->free) {
				struct extent_buffer *next;

				next = path->nodes[*level];

				if (trans) {
					btrfs_tree_lock(next);
					btrfs_set_lock_blocking(next);
					clean_tree_block(trans, root, next);
					btrfs_wait_tree_block_writeback(next);
					btrfs_tree_unlock(next);
				}

				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_and_pin_reserved_extent(root,
						path->nodes[*level]->start,
						path->nodes[*level]->len);
				if (ret)
					return ret;
			}
			/* this node is finished, drop it from the path */
			free_extent_buffer(path->nodes[*level]);
			path->nodes[*level] = NULL;
			*level = i + 1;
		}
	}
	return 1;
}

/*
 * Walk the tree rooted at log->node by alternating walk_down_log_tree()
 * and walk_up_log_tree() until one of them returns a positive value.
 * The blocks visited are passed to wc->process_func by those helpers;
 * when wc->free is set they are also released.
 *
 * If the root node is still in the path afterwards it is processed (and
 * released, when wc->free is set) here.
 *
 * Returns 0 or a negative errno.
 */
static int walk_log_tree(struct btrfs_trans_handle *trans,
			 struct btrfs_root *log, struct walk_control *wc)
{
	int ret = 0;
	int wret;
	int level;
	struct btrfs_path *path;
	int orig_level;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	level = btrfs_header_level(log->node);
	orig_level = level;
	path->nodes[level] = log->node;
	/* extra reference on the root node for the path that now holds it */
	extent_buffer_get(log->node);
	path->slots[level] = 0;

	while (1) {
		wret = walk_down_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0) {
			ret = wret;
			goto out;
		}

		wret = walk_up_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0) {
			ret = wret;
			goto out;
		}
	}

	/* was the root node processed? if not, catch it here */
	if (path->nodes[orig_level]) {
		ret = wc->process_func(log, path->nodes[orig_level], wc,
			 btrfs_header_generation(path->nodes[orig_level]));
		if (ret)
			goto out;
		if (wc->free) {
			struct extent_buffer *next;

			next = path->nodes[orig_level];

			if (trans) {
				btrfs_tree_lock(next);
				btrfs_set_lock_blocking(next);
				clean_tree_block(trans, log, next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);
			}

			WARN_ON(log->root_key.objectid !=
				BTRFS_TREE_LOG_OBJECTID);
			ret = btrfs_free_and_pin_reserved_extent(log, next->start,
							 next->len);
			if (ret)
				goto out;
		}
	}

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to update the item for a given subvolumes log root
 * in the tree of log roots
 *
 * The root item is inserted when log->log_transid is 1 and updated
 * otherwise.  Returns the result of the insert/update call.
 */
static int update_log_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *log)
{
	int ret;

	if (log->log_transid == 1) {
		/* insert root item on the first sync */
		ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
				&log->root_key, &log->root_item);
	} else {
		ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
				&log->root_key, &log->root_item);
	}
	return ret;
}

/*
 * Wait until the log commit for 'transid' is no longer pending.
 *
 * root->log_mutex is dropped while sleeping and re-taken before the
 * condition is re-checked (NOTE(review): this implies the caller holds
 * root->log_mutex on entry - confirm against the callers).
 *
 * Returns 0, or -EAGAIN when fs_info->last_trans_log_full_commit equals
 * the id of the running transaction.
 */
static int wait_log_commit(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, unsigned long transid)
{
	DEFINE_WAIT(wait);
	/* log_commit[] and log_commit_wait[] are indexed by transid parity */
	int index = transid % 2;
	int ret = 0;

	/*
	 * we only allow two pending log transactions at a time,
	 * so we know that if ours is more than 2 older than the
	 * current transaction, we're done
	 */
	do {
		if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
		    trans->transid) {
			ret = -EAGAIN;
			break;
		}
		prepare_to_wait(&root->log_commit_wait[index],
				&wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&root->log_mutex);

		if (root->log_transid < transid + 2 &&
		    atomic_read(&root->log_commit[index]))
			schedule();

		finish_wait(&root->log_commit_wait[index], &wait);
		mutex_lock(&root->log_mutex);
	} while (root->log_transid < transid + 2 &&
		 atomic_read(&root->log_commit[index]));

	return ret;
}

static void wait_for_writer(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{ DEFINE_WAIT(wait); while (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != trans->transid && atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != trans->transid && atomic_read(&root->log_writers)) schedule(); mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); } } /* * btrfs_sync_log does sends a given tree log down to the disk and * updates the super blocks to record it. When this call is done, * you know that any inodes previously logged are safely on disk only * if it returns 0. * * Any other return value means you need to call btrfs_commit_transaction. * Some of the edge cases for fsyncing directories that have had unlinks * or renames done in the past mean that sometimes the only safe * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, * that has happened. */ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int index1; int index2; int mark; int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; unsigned long log_transid = 0; struct blk_plug plug; mutex_lock(&root->log_mutex); log_transid = root->log_transid; index1 = root->log_transid % 2; if (atomic_read(&root->log_commit[index1])) { ret = wait_log_commit(trans, root, root->log_transid); mutex_unlock(&root->log_mutex); return ret; } atomic_set(&root->log_commit[index1], 1); /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); } wait_for_writer(trans, 
root); if (batch == atomic_read(&root->log_batch)) break; } /* bail out if we need to do a full commit */ if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == trans->transid) { ret = -EAGAIN; btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); goto out; } if (log_transid % 2 == 0) mark = EXTENT_DIRTY; else mark = EXTENT_NEW; /* we start IO on all the marked extents here, but we don't actually * wait for them until later. */ blk_start_plug(&plug); ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); goto out; } btrfs_set_root_node(&log->root_item, log->node); root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; smp_mb(); /* * IO has been started, blocks of the log tree have WRITTEN flag set * in their headers. new modifications of the log will be written to * new positions. so it's safe to allow log writers to go in. 
*/ mutex_unlock(&root->log_mutex); mutex_lock(&log_root_tree->log_mutex); atomic_inc(&log_root_tree->log_batch); atomic_inc(&log_root_tree->log_writers); mutex_unlock(&log_root_tree->log_mutex); ret = update_log_root(trans, log); mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { smp_mb(); if (waitqueue_active(&log_root_tree->log_writer_wait)) wake_up(&log_root_tree->log_writer_wait); } if (ret) { blk_finish_plug(&plug); if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); mutex_unlock(&log_root_tree->log_mutex); goto out; } ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = trans->transid; btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out; } index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); ret = wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out; } atomic_set(&log_root_tree->log_commit[index2], 1); if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { wait_log_commit(trans, log_root_tree, log_root_tree->log_transid - 1); } wait_for_writer(trans, log_root_tree); /* * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == trans->transid) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; } ret = btrfs_write_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); if (ret) { btrfs_abort_transaction(trans, root, ret); 
btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages, EXTENT_NEW | EXTENT_DIRTY); btrfs_wait_logged_extents(log, log_transid); btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); btrfs_set_super_log_root_level(root->fs_info->super_for_commit, btrfs_header_level(log_root_tree->node)); log_root_tree->log_transid++; smp_mb(); mutex_unlock(&log_root_tree->log_mutex); /* * nobody else is going to jump in and write the the ctree * super here because the log_commit atomic below is protecting * us. We must be called with a transaction handle pinning * the running transaction open, so a full commit can't hop * in and cause problems either. */ ret = write_ctree_super(trans, root->fs_info->tree_root, 1); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out_wake_log_root; } mutex_lock(&root->log_mutex); if (root->last_log_commit < log_transid) root->last_log_commit = log_transid; mutex_unlock(&root->log_mutex); out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: atomic_set(&root->log_commit[index1], 0); smp_mb(); if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); return ret; } static void free_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log) { int ret; u64 start; u64 end; struct walk_control wc = { .free = 1, .process_func = process_one_buffer }; ret = walk_log_tree(trans, log, &wc); /* I don't think this can happen but just in case */ if (ret) btrfs_abort_transaction(trans, log, ret); while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, NULL); if (ret) break; 
clear_extent_bits(&log->dirty_log_pages, start, end, EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); } /* * We may have short-circuited the log tree with the full commit logic * and left ordered extents on our list, so clear these out to keep us * from leaking inodes and memory. */ btrfs_free_logged_extents(log, 0); btrfs_free_logged_extents(log, 1); free_extent_buffer(log->node); kfree(log); } /* * free all the extents used by the tree log. This should be called * at commit time of the full transaction */ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (root->log_root) { free_log_tree(trans, root->log_root); root->log_root = NULL; } return 0; } int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { if (fs_info->log_root_tree) { free_log_tree(trans, fs_info->log_root_tree); fs_info->log_root_tree = NULL; } return 0; } /* * If both a file and directory are logged, and unlinks or renames are * mixed in, we have a few interesting corners: * * create file X in dir Y * link file X to X.link in dir Y * fsync file X * unlink file X but leave X.link * fsync dir Y * * After a crash we would expect only X.link to exist. But file X * didn't get fsync'd again so the log has back refs for X and X.link. * * We solve this by removing directory entries and inode backrefs from the * log when a file that was logged in the current transaction is * unlinked. Any later fsync will include the updated log entries, and * we'll be able to reconstruct the proper directory items from backrefs. * * This optimizations allows us to avoid relogging the entire inode * or the entire directory. 
*/ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *dir, u64 index) { struct btrfs_root *log; struct btrfs_dir_item *di; struct btrfs_path *path; int ret; int err = 0; int bytes_del = 0; u64 dir_ino = btrfs_ino(dir); if (BTRFS_I(dir)->logged_trans < trans->transid) return 0; ret = join_running_log_trans(root); if (ret) return 0; mutex_lock(&BTRFS_I(dir)->log_mutex); log = root->log_root; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; goto out_unlock; } di = btrfs_lookup_dir_item(trans, log, path, dir_ino, name, name_len, -1); if (IS_ERR(di)) { err = PTR_ERR(di); goto fail; } if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); bytes_del += name_len; if (ret) { err = ret; goto fail; } } btrfs_release_path(path); di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, index, name, name_len, -1); if (IS_ERR(di)) { err = PTR_ERR(di); goto fail; } if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); bytes_del += name_len; if (ret) { err = ret; goto fail; } } /* update the directory size in the log to reflect the names * we have removed */ if (bytes_del) { struct btrfs_key key; key.objectid = dir_ino; key.offset = 0; key.type = BTRFS_INODE_ITEM_KEY; btrfs_release_path(path); ret = btrfs_search_slot(trans, log, &key, path, 0, 1); if (ret < 0) { err = ret; goto fail; } if (ret == 0) { struct btrfs_inode_item *item; u64 i_size; item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); i_size = btrfs_inode_size(path->nodes[0], item); if (i_size > bytes_del) i_size -= bytes_del; else i_size = 0; btrfs_set_inode_size(path->nodes[0], item, i_size); btrfs_mark_buffer_dirty(path->nodes[0]); } else ret = 0; btrfs_release_path(path); } fail: btrfs_free_path(path); out_unlock: mutex_unlock(&BTRFS_I(dir)->log_mutex); if (ret == -ENOSPC) { root->fs_info->last_trans_log_full_commit = trans->transid; ret = 0; } else if (ret < 0) 
btrfs_abort_transaction(trans, root, ret); btrfs_end_log_trans(root); return err; } /* see comments for btrfs_del_dir_entries_in_log */ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *inode, u64 dirid) { struct btrfs_root *log; u64 index; int ret; if (BTRFS_I(inode)->logged_trans < trans->transid) return 0; ret = join_running_log_trans(root); if (ret) return 0; log = root->log_root; mutex_lock(&BTRFS_I(inode)->log_mutex); ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); if (ret == -ENOSPC) { root->fs_info->last_trans_log_full_commit = trans->transid; ret = 0; } else if (ret < 0 && ret != -ENOENT) btrfs_abort_transaction(trans, root, ret); btrfs_end_log_trans(root); return ret; } /* * creates a range item in the log for 'dirid'. first_offset and * last_offset tell us which parts of the key space the log should * be considered authoritative for. */ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, int key_type, u64 dirid, u64 first_offset, u64 last_offset) { int ret; struct btrfs_key key; struct btrfs_dir_log_item *item; key.objectid = dirid; key.offset = first_offset; if (key_type == BTRFS_DIR_ITEM_KEY) key.type = BTRFS_DIR_LOG_ITEM_KEY; else key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); if (ret) return ret; item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_log_item); btrfs_set_dir_log_end(path->nodes[0], item, last_offset); btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(path); return 0; } /* * log all the items included in the current transaction for a given * directory. 
This also creates the range items in the log tree required * to replay anything deleted before the fsync */ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, int key_type, u64 min_offset, u64 *last_offset_ret) { struct btrfs_key min_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src; int err = 0; int ret; int i; int nritems; u64 first_offset = min_offset; u64 last_offset = (u64)-1; u64 ino = btrfs_ino(inode); log = root->log_root; min_key.objectid = ino; min_key.type = key_type; min_key.offset = min_offset; path->keep_locks = 1; ret = btrfs_search_forward(root, &min_key, path, trans->transid); /* * we didn't find anything from this transaction, see if there * is anything at all */ if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { min_key.objectid = ino; min_key.type = key_type; min_key.offset = (u64)-1; btrfs_release_path(path); ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); if (ret < 0) { btrfs_release_path(path); return ret; } ret = btrfs_previous_item(root, path, ino, key_type); /* if ret == 0 there are items for this type, * create a range to tell us the last key of this type. * otherwise, there are no items in this directory after * *min_offset, and we create a range to indicate that. 
*/ if (ret == 0) { struct btrfs_key tmp; btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); if (key_type == tmp.type) first_offset = max(min_offset, tmp.offset) + 1; } goto done; } /* go backward to find any previous key */ ret = btrfs_previous_item(root, path, ino, key_type); if (ret == 0) { struct btrfs_key tmp; btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); if (key_type == tmp.type) { first_offset = tmp.offset; ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], &tmp); if (ret) { err = ret; goto done; } } } btrfs_release_path(path); /* find the first key from this transaction again */ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); if (WARN_ON(ret != 0)) goto done; /* * we have a block from this transaction, log every item in it * from our directory */ while (1) { struct btrfs_key tmp; src = path->nodes[0]; nritems = btrfs_header_nritems(src); for (i = path->slots[0]; i < nritems; i++) { btrfs_item_key_to_cpu(src, &min_key, i); if (min_key.objectid != ino || min_key.type != key_type) goto done; ret = overwrite_item(trans, log, dst_path, src, i, &min_key); if (ret) { err = ret; goto done; } } path->slots[0] = nritems; /* * look ahead to the next item and see if it is also * from this directory and from this transaction */ ret = btrfs_next_leaf(root, path); if (ret == 1) { last_offset = (u64)-1; goto done; } btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); if (tmp.objectid != ino || tmp.type != key_type) { last_offset = (u64)-1; goto done; } if (btrfs_header_generation(path->nodes[0]) != trans->transid) { ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], &tmp); if (ret) err = ret; else last_offset = tmp.offset; goto done; } } done: btrfs_release_path(path); btrfs_release_path(dst_path); if (err == 0) { *last_offset_ret = last_offset; /* * insert the log range keys to indicate where the log * is valid */ ret = insert_dir_log_key(trans, log, path, key_type, ino, 
first_offset, last_offset); if (ret) err = ret; } return err; } /* * logging directories is very similar to logging inodes, We find all the items * from the current transaction and write them to the log. * * The recovery code scans the directory in the subvolume, and if it finds a * key in the range logged that is not present in the log tree, then it means * that dir entry was unlinked during the transaction. * * In order for that scan to work, we must include one key smaller than * the smallest logged by this transaction and one key larger than the largest * key logged by this transaction. */ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path) { u64 min_key; u64 max_key; int ret; int key_type = BTRFS_DIR_ITEM_KEY; again: min_key = 0; max_key = 0; while (1) { ret = log_dir_items(trans, root, inode, path, dst_path, key_type, min_key, &max_key); if (ret) return ret; if (max_key == (u64)-1) break; min_key = max_key + 1; } if (key_type == BTRFS_DIR_ITEM_KEY) { key_type = BTRFS_DIR_INDEX_KEY; goto again; } return 0; } /* * a helper function to drop items from the log before we relog an * inode. max_key_type indicates the highest item type to remove. * This cannot be run for file data extents because it does not * free the extents they point to. 
*/
/*
 * Implementation note: the loop searches for the key
 * (objectid, max_key_type, (u64)-1), which is expected never to exist
 * (an exact match trips the BUG_ON), then steps back one slot and deletes
 * every item of 'objectid' found in that leaf.  It repeats while the
 * deleted run started at slot 0 of its leaf.
 *
 * Returns 0 on success or a negative error from the search or deletion.
 */
static int drop_objectid_items(struct btrfs_trans_handle *trans,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  u64 objectid, int max_key_type)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int start_slot;

	key.objectid = objectid;
	key.type = max_key_type;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
		BUG_ON(ret == 0); /* Logic error */
		if (ret < 0)
			break;

		/* nothing precedes the insertion point in this leaf */
		if (path->slots[0] == 0)
			break;

		path->slots[0]--;
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);

		if (found_key.objectid != objectid)
			break;

		/* locate the first slot in this leaf holding 'objectid' */
		found_key.offset = 0;
		found_key.type = 0;
		/* only start_slot is used; the return value is overwritten below */
		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
				       &start_slot);

		ret = btrfs_del_items(trans, log, path, start_slot,
				      path->slots[0] - start_slot + 1);
		/*
		 * If start slot isn't 0 then we don't need to re-search, we've
		 * found the last guy with the objectid in this tree.
		 */
		if (ret || start_slot != 0)
			break;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);
	/* a positive value only means "not found"; report success */
	if (ret > 0)
		ret = 0;
	return ret;
}

/*
 * Fill the on-disk inode item 'item' inside 'leaf' from the in-memory
 * 'inode'.  When 'log_inode_only' is set, generation and size are written
 * as zero; all other fields are copied from the inode either way.  The
 * transid field is set to trans->transid and the block group to 0.
 */
static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
			    struct btrfs_inode_item *item,
			    struct inode *inode, int log_inode_only)
{
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	if (log_inode_only) {
		/* set the generation to zero so the recovery code
		 * can tell the difference between a logging
		 * just to say 'this inode exists' and a logging
		 * to say 'update this inode with these values'
		 */
		btrfs_set_token_inode_generation(leaf, item, 0, &token);
		btrfs_set_token_inode_size(leaf, item, 0, &token);
	} else {
		btrfs_set_token_inode_generation(leaf, item,
						 BTRFS_I(inode)->generation,
						 &token);
		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
	}

	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);

	/* access, modification and change timestamps */
	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
				     inode->i_atime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
				      inode->i_atime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
				     inode->i_mtime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
				      inode->i_mtime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
				     inode->i_ctime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
				      inode->i_ctime.tv_nsec, &token);

	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
				     &token);

	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
}

/*
 * Write the full inode item for 'inode' into the log, keyed by the inode's
 * location.  An already existing item (-EEXIST) is not an error: it is
 * overwritten in place.  'path' is released before returning.
 *
 * Returns 0 on success or the error from the insertion.
 */
static int log_inode_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log, struct btrfs_path *path,
			  struct inode *inode)
{
	struct btrfs_inode_item *inode_item;
	int ret;

	ret = btrfs_insert_empty_item(trans, log, path,
				      &BTRFS_I(inode)->location,
				      sizeof(*inode_item));
	if (ret && ret != -EEXIST)
		return ret;
	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_inode_item);
	fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
	btrfs_release_path(path);
	return 0;
}

static noinline int copy_items(struct btrfs_trans_handle *trans,
			       struct inode *inode,
			       struct btrfs_path *dst_path,
			       struct btrfs_path *src_path, u64 *last_extent,
			       int start_slot, int nr, int inode_only)
{
	unsigned long src_offset;
	unsigned long dst_offset;
	struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
	struct btrfs_file_extent_item *extent;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *src = src_path->nodes[0];
	struct btrfs_key first_key, last_key, key;
	int ret;
	struct btrfs_key *ins_keys;
	u32 *ins_sizes;
char *ins_data; int i; struct list_head ordered_sums; int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; bool has_extents = false; bool need_find_last_extent = (*last_extent == 0); bool done = false; INIT_LIST_HEAD(&ordered_sums); ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); if (!ins_data) return -ENOMEM; first_key.objectid = (u64)-1; ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); for (i = 0; i < nr; i++) { ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); } ret = btrfs_insert_empty_items(trans, log, dst_path, ins_keys, ins_sizes, nr); if (ret) { kfree(ins_data); return ret; } for (i = 0; i < nr; i++, dst_path->slots[0]++) { dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_path->slots[0]); src_offset = btrfs_item_ptr_offset(src, start_slot + i); if ((i == (nr - 1))) last_key = ins_keys[i]; if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_path->slots[0], struct btrfs_inode_item); fill_inode_item(trans, dst_path->nodes[0], inode_item, inode, inode_only == LOG_INODE_EXISTS); } else { copy_extent_buffer(dst_path->nodes[0], src, dst_offset, src_offset, ins_sizes[i]); } /* * We set need_find_last_extent here in case we know we were * processing other items and then walk into the first extent in * the inode. If we don't hit an extent then nothing changes, * we'll do the last search the next time around. 
*/ if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { has_extents = true; if (need_find_last_extent && first_key.objectid == (u64)-1) first_key = ins_keys[i]; } else { need_find_last_extent = false; } /* take a reference on file data extents so that truncates * or deletes of this inode don't have to relog the inode * again */ if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && !skip_csum) { int found_type; extent = btrfs_item_ptr(src, start_slot + i, struct btrfs_file_extent_item); if (btrfs_file_extent_generation(src, extent) < trans->transid) continue; found_type = btrfs_file_extent_type(src, extent); if (found_type == BTRFS_FILE_EXTENT_REG) { u64 ds, dl, cs, cl; ds = btrfs_file_extent_disk_bytenr(src, extent); /* ds == 0 is a hole */ if (ds == 0) continue; dl = btrfs_file_extent_disk_num_bytes(src, extent); cs = btrfs_file_extent_offset(src, extent); cl = btrfs_file_extent_num_bytes(src, extent); if (btrfs_file_extent_compression(src, extent)) { cs = 0; cl = dl; } ret = btrfs_lookup_csums_range( log->fs_info->csum_root, ds + cs, ds + cs + cl - 1, &ordered_sums, 0); if (ret) { btrfs_release_path(dst_path); kfree(ins_data); return ret; } } } } btrfs_mark_buffer_dirty(dst_path->nodes[0]); btrfs_release_path(dst_path); kfree(ins_data); /* * we have to do this after the loop above to avoid changing the * log tree while trying to change the log tree. */ ret = 0; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, list); if (!ret) ret = btrfs_csum_file_blocks(trans, log, sums); list_del(&sums->list); kfree(sums); } if (!has_extents) return ret; /* * Because we use btrfs_search_forward we could skip leaves that were * not modified and then assume *last_extent is valid when it really * isn't. So back up to the previous leaf and read the end of the last * extent before we go and fill in holes. 
*/ if (need_find_last_extent) { u64 len; ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); if (ret < 0) return ret; if (ret) goto fill_holes; if (src_path->slots[0]) src_path->slots[0]--; src = src_path->nodes[0]; btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) goto fill_holes; extent = btrfs_item_ptr(src, src_path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_type(src, extent) == BTRFS_FILE_EXTENT_INLINE) { len = btrfs_file_extent_inline_len(src, src_path->slots[0], extent); *last_extent = ALIGN(key.offset + len, log->sectorsize); } else { len = btrfs_file_extent_num_bytes(src, extent); *last_extent = key.offset + len; } } fill_holes: /* So we did prev_leaf, now we need to move to the next leaf, but a few * things could have happened * * 1) A merge could have happened, so we could currently be on a leaf * that holds what we were copying in the first place. * 2) A split could have happened, and now not all of the items we want * are on the same leaf. * * So we need to adjust how we search for holes, we need to drop the * path and re-search for the first extent key we found, and then walk * forward until we hit the last one we copied. */ if (need_find_last_extent) { /* btrfs_prev_leaf could return 1 without releasing the path */ btrfs_release_path(src_path); ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, src_path, 0, 0); if (ret < 0) return ret; ASSERT(ret == 0); src = src_path->nodes[0]; i = src_path->slots[0]; } else { i = start_slot; } /* * Ok so here we need to go through and fill in any holes we may have * to make sure that holes are punched for those areas in case they had * extents previously. 
*/ while (!done) { u64 offset, len; u64 extent_end; if (i >= btrfs_header_nritems(src_path->nodes[0])) { ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); if (ret < 0) return ret; ASSERT(ret == 0); src = src_path->nodes[0]; i = 0; } btrfs_item_key_to_cpu(src, &key, i); if (!btrfs_comp_cpu_keys(&key, &last_key)) done = true; if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { i++; continue; }


context:
space:
mode: