net/openvswitch/flow_netlink.c
/*
 * Copyright (c) 2007-2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#include "flow.h"
#include "datapath.h"
#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <net/llc_pdu.h>
#include <linux/kernel.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/llc.h>
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/sctp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/rculist.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>

#include "flow_netlink.h"

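/* Grow the byte range of the flow key covered by this match so that it
 * includes [offset, offset + size), rounded out to long-word boundaries so
 * that masked key comparisons can be done a long at a time.
 */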
static void update_range__(struct sw_flow_match *match,
			   size_t offset, size_t size, bool is_mask)
{
	struct sw_flow_key_range *range = NULL;
	size_t start = rounddown(offset, sizeof(long));
	size_t end = roundup(offset + size, sizeof(long));

	if (!is_mask)
		range = &match->range;
	else if (match->mask)
		range = &match->mask->range;

	if (!range)
		return;

	if (range->start == range->end) {
		range->start = start;
		range->end = end;
		return;
	}

	if (range->start > start)
		range->start = start;

	if (range->end < end)
		range->end = end;
}

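/* Store a value (or a byte range) into either the flow key or, when is_mask
 * is true, the flow mask, and update the corresponding key range so that only
 * the touched bytes take part in flow comparisons.
 */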
#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \
	do { \
		update_range__(match, offsetof(struct sw_flow_key, field),  \
				     sizeof((match)->key->field), is_mask); \
		if (is_mask) {						    \
			if ((match)->mask)				    \
				(match)->mask->key.field = value;	    \
		} else {                                                    \
			(match)->key->field = value;		            \
		}                                                           \
	} while (0)

#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
	do { \
		update_range__(match, offsetof(struct sw_flow_key, field),  \
				len, is_mask);                              \
		if (is_mask) {						    \
			if ((match)->mask)				    \
				memcpy(&(match)->mask->key.field, value_p, len);\
		} else {                                                    \
			memcpy(&(match)->key->field, value_p, len);         \
		}                                                           \
	} while (0)

static u16 range_n_bytes(const struct sw_flow_key_range *range)
{
	return range->end - range->start;
}

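/* Verify that the key attributes supplied by userspace include everything the
 * parsed EtherType and IP protocol imply, and that the mask attributes are
 * limited to fields that may legitimately be wildcarded.
 */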
static bool match_validate(const struct sw_flow_match *match,
			   u64 key_attrs, u64 mask_attrs)
{
	u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET;
	u64 mask_allowed = key_attrs;  /* At most allow all key attributes */

	/* The following mask attributes are allowed only if they
	 * pass the validation tests. */
	mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
			| (1 << OVS_KEY_ATTR_IPV6)
			| (1 << OVS_KEY_ATTR_TCP)
			| (1 << OVS_KEY_ATTR_TCP_FLAGS)
			| (1 << OVS_KEY_ATTR_UDP)
			| (1 << OVS_KEY_ATTR_SCTP)
			| (1 << OVS_KEY_ATTR_ICMP)
			| (1 << OVS_KEY_ATTR_ICMPV6)
			| (1 << OVS_KEY_ATTR_ARP)
			| (1 << OVS_KEY_ATTR_ND));

	/* Always allowed mask fields. */
	mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL)
		       | (1 << OVS_KEY_ATTR_IN_PORT)
		       | (1 << OVS_KEY_ATTR_ETHERTYPE));

	/* Check key attributes. */
	if (match->key->eth.type == htons(ETH_P_ARP)
			|| match->key->eth.type == htons(ETH_P_RARP)) {
		key_expected |= 1 << OVS_KEY_ATTR_ARP;
		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
			mask_allowed |= 1 << OVS_KEY_ATTR_ARP;
	}

	if (match->key->eth.type == htons(ETH_P_IP)) {
		key_expected |= 1 << OVS_KEY_ATTR_IPV4;
		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
			mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;

		if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
			if (match->key->ip.proto == IPPROTO_UDP) {
				key_expected |= 1 << OVS_KEY_ATTR_UDP;
				if (match->mask && (match->mask->key.ip.proto == 0xff))
					mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
			}

			if (match->key->ip.proto == IPPROTO_SCTP) {
				key_expected |= 1 << OVS_KEY_ATTR_SCTP;
				if (match->mask && (match->mask->key.ip.proto == 0xff))
					mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
			}

			if (match->key->ip.proto == IPPROTO_TCP) {
				key_expected |= 1 << OVS_KEY_ATTR_TCP;
				key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
				if (match->mask && (match->mask->key.ip.proto == 0xff)) {
					mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
					mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
				}
			}

			if (match->key->ip.proto == IPPROTO_ICMP) {
				key_expected |= 1 << OVS_KEY_ATTR_ICMP;
				if (match->mask && (match->mask->key.ip.proto == 0xff))
					mask_allowed |= 1 << OVS_KEY_ATTR_ICMP;
			}
		}
	}

	if (match->key->eth.type == htons(ETH_P_IPV6)) {
		key_expected |= 1 << OVS_KEY_ATTR_IPV6;
		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
			mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;

		if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
			if (match->key->ip.proto == IPPROTO_UDP) {
				key_expected |= 1 << OVS_KEY_ATTR_UDP;
				if (match->mask && (match->mask->key.ip.proto == 0xff))
					mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
			}

			if (match->key->ip.proto == IPPROTO_SCTP) {
				key_expected |= 1 << OVS_KEY_ATTR_SCTP;
				if (match->mask && (match->mask->key.ip.proto == 0xff))
					mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
			}

			if (match->key->ip.proto == IPPROTO_TCP) {
				key_expected |= 1 << OVS_KEY_ATTR_TCP;
				key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
				if (match->mask && (match->mask->key.ip.proto == 0xff)) {
					mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
					mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
				}
			}

			if (match->key->ip.proto == IPPROTO_ICMPV6) {
				key_expected |= 1 << OVS_KEY_ATTR_ICMPV6;
				if (match->mask && (match->mask->key.ip.proto == 0xff))
					mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6;

				if (match->key->ipv6.tp.src ==
						htons(NDISC_NEIGHBOUR_SOLICITATION) ||
				    match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
					key_expected |= 1 << OVS_KEY_ATTR_ND;
					if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff)))
						mask_allowed |= 1 << OVS_KEY_ATTR_ND;
				}
			}
		}
	}

	if ((key_attrs & key_expected) != key_expected) {
		/* Key attributes check failed. */
		OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n",
				key_attrs, key_expected);
		return false;
	}

	if ((mask_attrs & mask_allowed) != mask_attrs) {
		/* Mask attributes check failed. */
		OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n",
				mask_attrs, mask_allowed);
		return false;
	}

	return true;
}

/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */
static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
	[OVS_KEY_ATTR_ENCAP] = -1,
	[OVS_KEY_ATTR_PRIORITY] = sizeof(u32),
	[OVS_KEY_ATTR_IN_PORT] = sizeof(u32),
	[OVS_KEY_ATTR_SKB_MARK] = sizeof(u32),
	[OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet),
	[OVS_KEY_ATTR_VLAN] = sizeof(__be16),
	[OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16),
	[OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4),
	[OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6),
	[OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp),
	[OVS_KEY_ATTR_TCP_FLAGS] = sizeof(__be16),
	[OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp),
	[OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp),
	[OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp),
	[OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
	[OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
	[OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
	[OVS_KEY_ATTR_TUNNEL] = -1,
};

static bool is_all_zero(const u8 *fp, size_t size)
{
	int i;

	if (!fp)
		return false;

	for (i = 0; i < size; i++)
		if (fp[i])
			return false;

	return true;
}

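/* Walk the nested OVS_KEY_ATTR_* attributes in 'attr', storing each one in
 * a[] and setting the matching bit in *attrsp.  Attribute lengths are checked
 * against ovs_key_lens[].  When 'nz' is true (mask parsing), attributes whose
 * payload is all zeroes are ignored.
 */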
static int __parse_flow_nlattrs(const struct nlattr *attr,
				const struct nlattr *a[],
				u64 *attrsp, bool nz)
{
	const struct nlattr *nla;
	u64 attrs;
	int rem;

	attrs = *attrsp;
	nla_for_each_nested(nla, attr, rem) {
		u16 type = nla_type(nla);
		int expected_len;

		if (type > OVS_KEY_ATTR_MAX) {
			OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n",
				  type, OVS_KEY_ATTR_MAX);
			return -EINVAL;
		}

		if (attrs & (1 << type)) {
			OVS_NLERR("Duplicate key attribute (type %d).\n", type);
			return -EINVAL;
		}

		expected_len = ovs_key_lens[type];
		if (nla_len(nla) != expected_len && expected_len != -1) {
			OVS_NLERR("Key attribute has unexpected length (type=%d"
				  ", length=%d, expected=%d).\n", type,
				  nla_len(nla), expected_len);
			return -EINVAL;
		}

		if (!nz || !is_all_zero(nla_data(nla), expected_len)) {
			attrs |= 1 << type;
			a[type] = nla;
		}
	}
	if (rem) {
		OVS_NLERR("Message has %d unknown bytes.\n", rem);
		return -EINVAL;
	}

	*attrsp = attrs;
	return 0;
}

static int parse_flow_mask_nlattrs(const struct nlattr *attr,
				   const struct nlattr *a[], u64 *attrsp)
{
	return __parse_flow_nlattrs(attr, a, attrsp, true);
}

static int parse_flow_nlattrs(const struct nlattr *attr,
			      const struct nlattr *a[], u64 *attrsp)
{
	return __parse_flow_nlattrs(attr, a, attrsp, false);
}

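/* Parse a nested OVS_KEY_ATTR_TUNNEL attribute into the tun_key part of the
 * match.  For a key (as opposed to a mask), the tunnel destination address
 * and TTL are mandatory.
 */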
static int ipv4_tun_from_nlattr(const struct nlattr *attr,
				struct sw_flow_match *match, bool is_mask)
{
	struct nlattr *a;
	int rem;
	bool ttl = false;
	__be16 tun_flags = 0;

	nla_for_each_nested(a, attr, rem) {
		int type = nla_type(a);
		static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
			[OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64),
			[OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32),
			[OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32),
			[OVS_TUNNEL_KEY_ATTR_TOS] = 1,
			[OVS_TUNNEL_KEY_ATTR_TTL] = 1,
			[OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
			[OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
		};

		if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
			OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n",
			type, OVS_TUNNEL_KEY_ATTR_MAX);
			return -EINVAL;
		}

		if (ovs_tunnel_key_lens[type] != nla_len(a)) {
			OVS_NLERR("IPv4 tunnel attribute type has unexpected "
				  " length (type=%d, length=%d, expected=%d).\n",
				  type, nla_len(a), ovs_tunnel_key_lens[type]);
			return -EINVAL;
		}

		switch (type) {
		case OVS_TUNNEL_KEY_ATTR_ID:
			SW_FLOW_KEY_PUT(match, tun_key.tun_id,
					nla_get_be64(a), is_mask);
			tun_flags |= TUNNEL_KEY;
			break;
		case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
			SW_FLOW_KEY_PUT(match, tun_key.ipv4_src,
					nla_get_be32(a), is_mask);
			break;
		case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
			SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst,
					nla_get_be32(a), is_mask);
			break;
		case OVS_TUNNEL_KEY_ATTR_TOS:
			SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos,
					nla_get_u8(a), is_mask);
			break;
		case OVS_TUNNEL_KEY_ATTR_TTL:
			SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl,
					nla_get_u8(a), is_mask);
			ttl = true;
			break;
		case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT:
			tun_flags |= TUNNEL_DONT_FRAGMENT;
			break;
		case OVS_TUNNEL_KEY_ATTR_CSUM:
			tun_flags |= TUNNEL_CSUM;
			break;
		default:
			return -EINVAL;
		}
	}

	SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask);

	if (rem > 0) {
		OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem);
		return -EINVAL;
	}

	if (!is_mask) {
		if (!match->key->tun_key.ipv4_dst) {
			OVS_NLERR("IPv4 tunnel destination address is zero.\n");
			return -EINVAL;
		}

		if (!ttl) {
			OVS_NLERR("IPv4 tunnel TTL not specified.\n");
			return -EINVAL;
		}
	}

	return 0;
}

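/* Emit a nested OVS_KEY_ATTR_TUNNEL attribute describing 'output', which is
 * either an IPv4 tunnel key or the corresponding mask.
 */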
static int ipv4_tun_to_nlattr(struct sk_buff *skb,
			      const struct ovs_key_ipv4_tunnel *tun_key,
			      const struct ovs_key_ipv4_tunnel *output)
{
	struct nlattr *nla;

	nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
	if (!nla)
		return -EMSGSIZE;

	if (output->tun_flags & TUNNEL_KEY &&
	    nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
		return -EMSGSIZE;
	if (output->ipv4_src &&
		nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src))
		return -EMSGSIZE;
	if (output->ipv4_dst &&
		nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst))
		return -EMSGSIZE;
	if (output->ipv4_tos &&
		nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
		return -EMSGSIZE;
	if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
		return -EMSGSIZE;
	if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
		nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
		return -EMSGSIZE;
	if ((output->tun_flags & TUNNEL_CSUM) &&
		nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
		return -EMSGSIZE;

	nla_nest_end(skb, nla);
	return 0;
}


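/* Extract the packet metadata attributes (priority, input port, skb mark and
 * tunnel key) from a[] into the match, clearing the corresponding bits in
 * *attrs as they are consumed.
 */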
static int metadata_from_nlattrs(struct sw_flow_match *match,  u64 *attrs,
				 const struct nlattr **a, bool is_mask)
{
	if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
		SW_FLOW_KEY_PUT(match, phy.priority,
			  nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask);
		*attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
	}

	if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
		u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);

		if (is_mask)
			in_port = 0xffffffff; /* Always exact match in_port. */
		else if (in_port >= DP_MAX_PORTS)
			return -EINVAL;

		SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask);
		*attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
	} else if (!is_mask) {
		SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask);
	}

	if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) {
		uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]);

		SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask);
		*attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
	}
	if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
		if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
					 is_mask))
			return -EINVAL;
		*attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
	}
	return 0;
}

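/* Fill in the flow key (or, when is_mask is true, the flow mask) from the
 * parsed attribute array a[].  Returns -EINVAL if any attribute is malformed
 * or if unhandled bits remain in 'attrs'.
 */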
static int ovs_key_from_nlattrs(struct sw_flow_match *match,  u64 attrs,
				const struct nlattr **a, bool is_mask)
{
	int err;
	u64 orig_attrs = attrs;

	err = metadata_from_nlattrs(match, &attrs, a, is_mask);
	if (err)
		return err;

	if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) {
		const struct ovs_key_ethernet *eth_key;

		eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
		SW_FLOW_KEY_MEMCPY(match, eth.src,
				eth_key->eth_src, ETH_ALEN, is_mask);
		SW_FLOW_KEY_MEMCPY(match, eth.dst,
				eth_key->eth_dst, ETH_ALEN, is_mask);
		attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
	}

	if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
		__be16 tci;

		tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
		if (!(tci & htons(VLAN_TAG_PRESENT))) {
			if (is_mask)
				OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n");
			else
				OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n");

			return -EINVAL;
		}

		SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask);
		attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
	} else if (!is_mask)
		SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true);

	if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
		__be16 eth_type;

		eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
		if (is_mask) {
			/* Always exact match EtherType. */
			eth_type = htons(0xffff);
		} else if (ntohs(eth_type) < ETH_P_802_3_MIN) {
			OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n",
					ntohs(eth_type), ETH_P_802_3_MIN);
			return -EINVAL;
		}

		SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
		attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
	} else if (!is_mask) {
		SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
	}

	if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
		const struct ovs_key_ipv4 *ipv4_key;

		ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
		if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) {
			OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n",
				ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX);
			return -EINVAL;
		}
		SW_FLOW_KEY_PUT(match, ip.proto,
				ipv4_key->ipv4_proto, is_mask);
		SW_FLOW_KEY_PUT(match, ip.tos,
				ipv4_key->ipv4_tos, is_mask);
		SW_FLOW_KEY_PUT(match, ip.ttl,
				ipv4_key->ipv4_ttl, is_mask);
		SW_FLOW_KEY_PUT(match, ip.frag,
				ipv4_key->ipv4_frag, is_mask);
		SW_FLOW_KEY_PUT(match, ipv4.addr.src,
				ipv4_key->ipv4_src, is_mask);
		SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
				ipv4_key->ipv4_dst, is_mask);
		attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
	}

	if (attrs & (1 << OVS_KEY_ATTR_IPV6)) {
		const struct ovs_key_ipv6 *ipv6_key;

		ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
		if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) {
			OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n",
				ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX);
			return -EINVAL;
		}
		SW_FLOW_KEY_PUT(match, ipv6.label,
				ipv6_key->ipv6_label, is_mask);
		SW_FLOW_KEY_PUT(match, ip.proto,
				ipv6_key->ipv6_proto, is_mask);
		SW_FLOW_KEY_PUT(match, ip.tos,
				ipv6_key->ipv6_tclass, is_mask);
		SW_FLOW_KEY_PUT(match, ip.ttl,
				ipv6_key->ipv6_hlimit, is_mask);
		SW_FLOW_KEY_PUT(match, ip.frag,
				ipv6_key->ipv6_frag, is_mask);
		SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src,
				ipv6_key->ipv6_src,
				sizeof(match->key->ipv6.addr.src),
				is_mask);
		SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst,
				ipv6_key->ipv6_dst,
				sizeof(match->key->ipv6.addr.dst),
				is_mask);

		attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
	}

	if (attrs & (1 << OVS_KEY_ATTR_ARP)) {
		const struct ovs_key_arp *arp_key;

		arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
		if (!is_mask && (arp_key->arp_op & htons(0xff00))) {
			OVS_NLERR("Unknown ARP opcode (opcode=%d).\n",
				  arp_key->arp_op);
			return -EINVAL;
		}

		SW_FLOW_KEY_PUT(match, ipv4.addr.src,
				arp_key->arp_sip, is_mask);
		SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
			arp_key->arp_tip, is_mask);
		SW_FLOW_KEY_PUT(match, ip.proto,
				ntohs(arp_key->arp_op), is_mask);
		SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha,
				arp_key->arp_sha, ETH_ALEN, is_mask);
		SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha,
				arp_key->arp_tha, ETH_ALEN, is_mask);

		attrs &= ~(1 << OVS_KEY_ATTR_ARP);
	}

	if (attrs & (1 << OVS_KEY_ATTR_TCP)) {
		const struct ovs_key_tcp *tcp_key;

		tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
		if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
			SW_FLOW_KEY_PUT(match, ipv4.tp.src,
					tcp_key->tcp_src, is_mask);
			SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
					tcp_key->tcp_dst, is_mask);
		} else {
			SW_FLOW_KEY_PUT(match, ipv6.tp.src,
					tcp_key->tcp_src, is_mask);
			SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
					tcp_key->tcp_dst, is_mask);
		}
		attrs &= ~(1 << OVS_KEY_ATTR_TCP);
	}

	if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) {
		if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
			SW_FLOW_KEY_PUT(match, ipv4.tp.flags,
					nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
					is_mask);
		} else {
			SW_FLOW_KEY_PUT(match, ipv6.tp.flags,
					nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
					is_mask);
		}
		attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS);
	}

	if (attrs & (1 << OVS_KEY_ATTR_UDP)) {
		const struct ovs_key_udp *udp_key;

		udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
		if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
			SW_FLOW_KEY_PUT(match, ipv4.tp.src,
					udp_key->udp_src, is_mask);
			SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
					udp_key->udp_dst, is_mask);
		} else {
			SW_FLOW_KEY_PUT(match, ipv6.tp.src,
					udp_key->udp_src, is_mask);
			SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
					udp_key->udp_dst, is_mask);
		}
		attrs &= ~(1 << OVS_KEY_ATTR_UDP);
	}

	if (attrs & (1 << OVS_KEY_ATTR_SCTP)) {
		const struct ovs_key_sctp *sctp_key;

		sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]);
		if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
			SW_FLOW_KEY_PUT(match, ipv4.tp.src,
					sctp_key->sctp_src, is_mask);
			SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
					sctp_key->sctp_dst, is_mask);
		} else {
			SW_FLOW_KEY_PUT(match, ipv6.tp.src,
					sctp_key->sctp_src, is_mask);
			SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
					sctp_key->sctp_dst, is_mask);
		}
		attrs &= ~(1 << OVS_KEY_ATTR_SCTP);
	}

	if (attrs & (1 << OVS_KEY_ATTR_ICMP)) {
		const struct ovs_key_icmp *icmp_key;

		icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
		SW_FLOW_KEY_PUT(match, ipv4.tp.src,
				htons(icmp_key->icmp_type), is_mask);
		SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
				htons(icmp_key->icmp_code), is_mask);
		attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
	}

	if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) {
		const struct ovs_key_icmpv6 *icmpv6_key;

		icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
		SW_FLOW_KEY_PUT(match, ipv6.tp.src,
				htons(icmpv6_key->icmpv6_type), is_mask);
		SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
				htons(icmpv6_key->icmpv6_code), is_mask);
		attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
	}

	if (attrs & (1 << OVS_KEY_ATTR_ND)) {
		const struct ovs_key_nd *nd_key;

		nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
		SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target,
			nd_key->nd_target,
			sizeof(match->key->ipv6.nd.target),
			is_mask);
		SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll,
			nd_key->nd_sll, ETH_ALEN, is_mask);
		SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll,
				nd_key->nd_tll, ETH_ALEN, is_mask);
		attrs &= ~(1 << OVS_KEY_ATTR_ND);
	}

	if (attrs != 0)
		return -EINVAL;

	return 0;
}

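/* Set every byte of the mask key within 'range' to 'val'. */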
static void sw_flow_mask_set(struct sw_flow_mask *mask,
			     struct sw_flow_key_range *range, u8 val)
{
	u8 *m = (u8 *)&mask->key + range->start;

	mask->range = *range;
	memset(m, val, range_n_bytes(range));
}

/**
 * ovs_nla_get_match - parses Netlink attributes into a flow key and mask.
 * If 'mask' is NULL, the flow is treated as an exact-match flow.  Otherwise
 * it is treated as a wildcarded flow, unless the mask contains no don't-care
 * bits.
 * @match: receives the extracted flow match information.
 * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
 * sequence.  The fields should be those of the packet that triggered the
 * creation of this flow.
 * @mask: Optional.  Netlink attribute holding a nested %OVS_KEY_ATTR_*
 * attribute sequence that specifies the mask of the wildcarded flow.
 */
int ovs_nla_get_match(struct sw_flow_match *match,
		      const struct nlattr *key,
		      const struct nlattr *mask)
{
	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
	const struct nlattr *encap;
	u64 key_attrs = 0;
	u64 mask_attrs = 0;
	bool encap_valid = false;
	int err;

	err = parse_flow_nlattrs(key, a, &key_attrs);
	if (err)
		return err;

	if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
	    (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
	    (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) {
		__be16 tci;

		if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
		      (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
			OVS_NLERR("Invalid Vlan frame.\n");
			return -EINVAL;
		}

		key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
		tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
		encap = a[OVS_KEY_ATTR_ENCAP];
		key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
		encap_valid = true;

		if (tci & htons(VLAN_TAG_PRESENT)) {
			err = parse_flow_nlattrs(encap, a, &key_attrs);
			if (err)
				return err;
		} else if (!tci) {
			/* Corner case for truncated 802.1Q header. */
			if (nla_len(encap)) {
				OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n");
				return -EINVAL;
			}
		} else {
			OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n");
			return  -EINVAL;
		}
	}

	err = ovs_key_from_nlattrs(match, key_attrs, a, false);
	if (err)
		return err;

	if (mask) {
		err = parse_flow_mask_nlattrs(mask, a, &mask_attrs);
		if (err)
			return err;

		if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP)  {
			__be16 eth_type = 0;
			__be16 tci = 0;

			if (!encap_valid) {
				OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n");
				return  -EINVAL;
			}

			mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
			if (a[OVS_KEY_ATTR_ETHERTYPE])
				eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);

			if (eth_type == htons(0xffff)) {
				mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
				encap = a[OVS_KEY_ATTR_ENCAP];
				err = parse_flow_mask_nlattrs(encap, a, &mask_attrs);
			} else {
				OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n",
						ntohs(eth_type));
				return -EINVAL;
			}

			if (a[OVS_KEY_ATTR_VLAN])
				tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);

			if (!(tci & htons(VLAN_TAG_PRESENT))) {
				OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci));
				return -EINVAL;
			}
		}

		err = ovs_key_from_nlattrs(match, mask_attrs, a, true);
		if (err)
			return err;
	} else {
		/* Populate exact match flow's key mask. */
		if (match->mask)
			sw_flow_mask_set(match->mask, &match->range, 0xff);
	}

	if (!match_validate(match, key_attrs, mask_attrs))
		return -EINVAL;

	return 0;
}

/**
 * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key.
 * @flow: Receives extracted in_port, priority, tun_key and skb_mark.
 * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
 * sequence.
 *
 * This parses a series of Netlink attributes that form a flow key, which must
 * take the same form accepted by ovs_nla_get_match(), but only enough of it to
 * get the metadata, that is, the parts of the flow key that cannot be
 * extracted from the packet itself.
 */
int ovs_nla_get_flow_metadata(struct sw_flow *flow,
			      const struct nlattr *attr)
{
	struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key;
	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
	u64 attrs = 0;
	int err;
	struct sw_flow_match match;

	flow->key.phy.in_port = DP_MAX_PORTS;
	flow->key.phy.priority = 0;
	flow->key.phy.skb_mark = 0;
	memset(tun_key, 0, sizeof(flow->key.tun_key));

	err = parse_flow_nlattrs(attr, a, &attrs);
	if (err)
		return -EINVAL;

	memset(&match, 0, sizeof(match));
	match.key = &flow->key;

	err = metadata_from_nlattrs(&match, &attrs, a, false);
	if (err)
		return err;

	return 0;
}

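/* Serialize 'output' as a sequence of OVS_KEY_ATTR_* attributes on 'skb'.
 * When 'output' is not the same key as 'swkey', it is treated as the mask of
 * 'swkey' and encoded with the mask conventions (e.g. an exact-match 802.1Q
 * EtherType is written as 0xffff).
 */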
int ovs_nla_put_flow(const struct sw_flow_key *swkey,
		     const struct sw_flow_key *output, struct sk_buff *skb)
{
	struct ovs_key_ethernet *eth_key;
	struct nlattr *nla, *encap;
	bool is_mask = (swkey != output);

	if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
		goto nla_put_failure;

	if ((swkey->tun_key.ipv4_dst || is_mask) &&
	    ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key))
		goto nla_put_failure;

	if (swkey->phy.in_port == DP_MAX_PORTS) {
		if (is_mask && (output->phy.in_port == 0xffff))
			if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff))
				goto nla_put_failure;
	} else {
		u16 upper_u16;
		upper_u16 = !is_mask ? 0 : 0xffff;

		if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT,
				(upper_u16 << 16) | output->phy.in_port))
			goto nla_put_failure;
	}

	if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
		goto nla_put_failure;

	nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
	if (!nla)
		goto nla_put_failure;

	eth_key = nla_data(nla);
	memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN);
	memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN);

	if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
		__be16 eth_type;
		eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff);
		if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
		    nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci))
			goto nla_put_failure;
		encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
		if (!swkey->eth.tci)
			goto unencap;
	} else
		encap = NULL;

	if (swkey->eth.type == htons(ETH_P_802_2)) {
		/*
		 * Ethertype 802.2 is represented in Netlink by omitting
		 * OVS_KEY_ATTR_ETHERTYPE from the flow key attribute and
		 * putting 0xffff in the mask attribute.  The Ethertype can
		 * also be wildcarded.
		 */
		if (is_mask && output->eth.type)
			if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
						output->eth.type))
				goto nla_put_failure;
		goto unencap;
	}

	if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
		goto nla_put_failure;

	if (swkey->eth.type == htons(ETH_P_IP)) {
		struct ovs_key_ipv4 *ipv4_key;

		nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key));
		if (!nla)
			goto nla_put_failure;
		ipv4_key = nla_data(nla);
		ipv4_key->ipv4_src = output->ipv4.addr.src;
		ipv4_key->ipv4_dst = output->ipv4.addr.dst;
		ipv4_key->ipv4_proto = output->ip.proto;
		ipv4_key->ipv4_tos = output->ip.tos;
		ipv4_key->ipv4_ttl = output->ip.ttl;
		ipv4_key->ipv4_frag = output->ip.frag;
	} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
		struct ovs_key_ipv6 *ipv6_key;

		nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key));
		if (!nla)
			goto nla_put_failure;
		ipv6_key = nla_data(nla);
		memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src,
				sizeof(ipv6_key->ipv6_src));
		memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst,
				sizeof(ipv6_key->ipv6_dst));
		ipv6_key->ipv6_label = output->ipv6.label;
		ipv6_key->ipv6_proto = output->ip.proto;
		ipv6_key->ipv6_tclass = output->ip.tos;
		ipv6_key->ipv6_hlimit = output->ip.ttl;
		ipv6_key->ipv6_frag = output->ip.frag;
	} else if (swkey->eth.type == htons(ETH_P_ARP) ||
		   swkey->eth.type == htons(ETH_P_RARP)) {
		struct ovs_key_arp *arp_key;

		nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key));
		if (!nla)
			goto nla_put_failure;
		arp_key = nla_data(nla);
		memset(arp_key, 0, sizeof(struct ovs_key_arp));
		arp_key->arp_sip = output->ipv4.addr.src;
		arp_key->arp_tip = output->ipv4.addr.dst;
		arp_key->arp_op = htons(output->ip.proto);
		memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN);
		memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN);
	}

	if ((swkey->eth.type == htons(ETH_P_IP) ||
	     swkey->eth.type == htons(ETH_P_IPV6)) &&
	     swkey->ip.frag != OVS_FRAG_TYPE_LATER) {

		if (swkey->ip.proto == IPPROTO_TCP) {
			struct ovs_key_tcp *tcp_key;

			nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key));
			if (!nla)
				goto nla_put_failure;
			tcp_key = nla_data(nla);
			if (swkey->eth.type == htons(ETH_P_IP)) {
				tcp_key->tcp_src = output->ipv4.tp.src;
				tcp_key->tcp_dst = output->ipv4.tp.dst;
				if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS,
						 output->ipv4.tp.flags))
					goto nla_put_failure;
			} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
				tcp_key->tcp_src = output->ipv6.tp.src;
				tcp_key->tcp_dst = output->ipv6.tp.dst;
				if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS,
						 output->ipv6.tp.flags))
					goto nla_put_failure;
			}
		} else if (swkey->ip.proto == IPPROTO_UDP) {
			struct ovs_key_udp *udp_key;

			nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key));
			if (!nla)
				goto nla_put_failure;
			udp_key = nla_data(nla);
			if (swkey->eth.type == htons(ETH_P_IP)) {
				udp_key->udp_src = output->ipv4.tp.src;
				udp_key->udp_dst = output->ipv4.tp.dst;
			} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
				udp_key->udp_src = output->ipv6.tp.src;
				udp_key->udp_dst = output->ipv6.tp.dst;
			}
		} else if (swkey->ip.proto == IPPROTO_SCTP) {
			struct ovs_key_sctp *sctp_key;

			nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key));
			if (!nla)
				goto nla_put_failure;
			sctp_key = nla_data(nla);
			if (swkey->eth.type == htons(ETH_P_IP)) {
				sctp_key->sctp_src = swkey->ipv4.tp.src;
				sctp_key->sctp_dst = swkey->ipv4.tp.dst;
			} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
				sctp_key->sctp_src = swkey->ipv6.tp.src;
				sctp_key->sctp_dst = swkey->ipv6.tp.dst;
			}
		} else if (swkey->eth.type == htons(ETH_P_IP) &&
			   swkey->ip.proto == IPPROTO_ICMP) {
			struct ovs_key_icmp *icmp_key;

			nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key));
			if (!nla)
				goto nla_put_failure;
			icmp_key = nla_data(nla);
			icmp_key->icmp_type = ntohs(output->ipv4.tp.src);
			icmp_key->icmp_code = ntohs(output->ipv4.tp.dst);
		} else if (swkey->eth.type == htons(ETH_P_IPV6) &&
			   swkey->ip.proto == IPPROTO_ICMPV6) {
			struct ovs_key_icmpv6 *icmpv6_key;

			nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6,
						sizeof(*icmpv6_key));
			if (!nla)
				goto nla_put_failure;
			icmpv6_key = nla_data(nla);
			icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src);
			icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst);

			if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
			    icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
				struct ovs_key_nd *nd_key;

				nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key));
				if (!nla)
					goto nla_put_failure;
				nd_key = nla_data(nla);
				memcpy(nd_key->nd_target, &output->ipv6.nd.target,
							sizeof(nd_key->nd_target));
				memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN);
				memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN);
			}
		}
	}

unencap:
	if (encap)
		nla_nest_end(skb, encap);

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

#define MAX_ACTIONS_BUFSIZE	(32 * 1024)

struct sw_flow_actions *ovs_nla_alloc_flow_actions(int size)
{
	struct sw_flow_actions *sfa;

	if (size > MAX_ACTIONS_BUFSIZE)
		return ERR_PTR(-EINVAL);

	sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL);
	if (!sfa)
		return ERR_PTR(-ENOMEM);

	sfa->actions_len = 0;
	return sfa;
}

/* RCU callback used by ovs_nla_free_flow_actions. */
static void rcu_free_acts_callback(struct rcu_head *rcu)
{
	struct sw_flow_actions *sf_acts = container_of(rcu,
			struct sw_flow_actions, rcu);
	kfree(sf_acts);
}

/* Schedules 'sf_acts' to be freed after the next RCU grace period.
 * The caller must hold rcu_read_lock for this to be sensible. */
void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
{
	call_rcu(&sf_acts->rcu, rcu_free_acts_callback);
}

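/* Ensure there is room for 'attr_len' more bytes of actions, doubling the
 * buffer (up to MAX_ACTIONS_BUFSIZE) if necessary, and return a pointer to
 * the newly reserved space.
 */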
static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
				       int attr_len)
{

	struct sw_flow_actions *acts;
	int new_acts_size;
	int req_size = NLA_ALIGN(attr_len);
	int next_offset = offsetof(struct sw_flow_actions, actions) +
					(*sfa)->actions_len;

	if (req_size <= (ksize(*sfa) - next_offset))
		goto out;

	new_acts_size = ksize(*sfa) * 2;

	if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
		if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size)
			return ERR_PTR(-EMSGSIZE);
		new_acts_size = MAX_ACTIONS_BUFSIZE;
	}

	acts = ovs_nla_alloc_flow_actions(new_acts_size);
	if (IS_ERR(acts))
		return (void *)acts;

	memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len);
	acts->actions_len = (*sfa)->actions_len;
	kfree(*sfa);
	*sfa = acts;

out:
	(*sfa)->actions_len += req_size;
	return  (struct nlattr *) ((unsigned char *)(*sfa) + next_offset);
}

static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len)
{
	struct nlattr *a;

	a = reserve_sfa_size(sfa, nla_attr_size(len));
	if (IS_ERR(a))
		return PTR_ERR(a);

	a->nla_type = attrtype;
	a->nla_len = nla_attr_size(len);

	if (data)
		memcpy(nla_data(a), data, len);
	memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len));

	return 0;
}

static inline int add_nested_action_start(struct sw_flow_actions **sfa,
					  int attrtype)
{
	int used = (*sfa)->actions_len;
	int err;

	err = add_action(sfa, attrtype, NULL, 0);
	if (err)
		return err;

	return used;
}

static inline void add_nested_action_end(struct sw_flow_actions *sfa,
					 int st_offset)
{
	struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions +
							       st_offset);

	a->nla_len = sfa->actions_len - st_offset;
}

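/* Validate an OVS_ACTION_ATTR_SAMPLE action and copy it, including its
 * nested list of actions, into the flow's action buffer.
 */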
static int validate_and_copy_sample(const struct nlattr *attr,
				    const struct sw_flow_key *key, int depth,
				    struct sw_flow_actions **sfa)
{
	const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
	const struct nlattr *probability, *actions;
	const struct nlattr *a;
	int rem, start, err, st_acts;

	memset(attrs, 0, sizeof(attrs));
	nla_for_each_nested(a, attr, rem) {
		int type = nla_type(a);
		if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
			return -EINVAL;
		attrs[type] = a;
	}
	if (rem)
		return -EINVAL;

	probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
	if (!probability || nla_len(probability) != sizeof(u32))
		return -EINVAL;

	actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
	if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
		return -EINVAL;

	/* validation done, copy sample action. */
	start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE);
	if (start < 0)
		return start;
	err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY,
			 nla_data(probability), sizeof(u32));
	if (err)
		return err;
	st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS);
	if (st_acts < 0)
		return st_acts;

	err = ovs_nla_copy_actions(actions, key, depth + 1, sfa);
	if (err)
		return err;

	add_nested_action_end(*sfa, st_acts);
	add_nested_action_end(*sfa, start);

	return 0;
}

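/* Used when validating transport-layer set actions: require that the flow
 * key actually carries transport ports, i.e. at least one port field is
 * non-zero for the flow's EtherType.
 */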
static int validate_tp_port(const struct sw_flow_key *flow_key)
{
	if (flow_key->eth.type == htons(ETH_P_IP)) {
		if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst)
			return 0;
	} else if (flow_key->eth.type == htons(ETH_P_IPV6)) {
		if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst)
			return 0;
	}

	return -EINVAL;
}

void ovs_match_init(struct sw_flow_match *match,
		    struct sw_flow_key *key,
		    struct sw_flow_mask *mask)
{
	memset(match, 0, sizeof(*match));
	match->key = key;
	match->mask = mask;

	memset(key, 0, sizeof(*key));

	if (mask) {
		memset(&mask->key, 0, sizeof(mask->key));
		mask->range.start = mask->range.end = 0;
	}
}

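/* Convert the tunnel attributes of a set action into an internal
 * OVS_KEY_ATTR_IPV4_TUNNEL action carrying a binary struct
 * ovs_key_ipv4_tunnel, nested inside OVS_ACTION_ATTR_SET.
 */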
static int validate_and_copy_set_tun(const struct nlattr *attr,
				     struct sw_flow_actions **sfa)
{
	struct sw_flow_match match;
	struct sw_flow_key key;
	int err, start;

	ovs_match_init(&match, &key, NULL);
	err = ipv4_tun_from_nlattr(nla_data(attr), &match, false);
	if (err)
		return err;

	start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET);
	if (start < 0)
		return start;

	err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key,
			sizeof(match.key->tun_key));
	add_nested_action_end(*sfa, start);

	return err;
}

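/* Validate an OVS_ACTION_ATTR_SET action against the flow key it will be
 * applied to.  Tunnel sets are copied here in converted form and *set_tun is
 * set so that the caller skips the generic copy.
 */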
static int validate_set(const struct nlattr *a,
			const struct sw_flow_key *flow_key,
			struct sw_flow_actions **sfa,
			bool *set_tun)
{
	const struct nlattr *ovs_key = nla_data(a);
	int key_type = nla_type(ovs_key);

	/* There can be only one key in an action. */
	if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
		return -EINVAL;

	if (key_type > OVS_KEY_ATTR_MAX ||
	    (ovs_key_lens[key_type] != nla_len(ovs_key) &&
	     ovs_key_lens[key_type] != -1))
		return -EINVAL;

	switch (key_type) {
	const struct ovs_key_ipv4 *ipv4_key;
	const struct ovs_key_ipv6 *ipv6_key;
	int err;

	case OVS_KEY_ATTR_PRIORITY:
	case OVS_KEY_ATTR_SKB_MARK:
	case OVS_KEY_ATTR_ETHERNET:
		break;

	case OVS_KEY_ATTR_TUNNEL:
		*set_tun = true;
		err = validate_and_copy_set_tun(a, sfa);
		if (err)
			return err;
		break;

	case OVS_KEY_ATTR_IPV4:
		if (flow_key->eth.type != htons(ETH_P_IP))
			return -EINVAL;

		if (!flow_key->ip.proto)
			return -EINVAL;

		ipv4_key = nla_data(ovs_key);
		if (ipv4_key->ipv4_proto != flow_key->ip.proto)
			return -EINVAL;

		if (ipv4_key->ipv4_frag != flow_key->ip.frag)
			return -EINVAL;

		break;

	case OVS_KEY_ATTR_IPV6:
		if (flow_key->eth.type != htons(ETH_P_IPV6))
			return -EINVAL;

		if (!flow_key->ip.proto)
			return -EINVAL;

		ipv6_key = nla_data(ovs_key);
		if (ipv6_key->ipv6_proto != flow_key->ip.proto)
			return -EINVAL;

		if (ipv6_key->ipv6_frag != flow_key->ip.frag)
			return -EINVAL;

		if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000)
			return -EINVAL;

		break;

	case OVS_KEY_ATTR_TCP:
		if (flow_key->ip.proto != IPPROTO_TCP)
			return -EINVAL;

		return validate_tp_port(flow_key);

	case OVS_KEY_ATTR_UDP:
		if (flow_key->ip.proto != IPPROTO_UDP)
			return -EINVAL;

		return validate_tp_port(flow_key);

	case OVS_KEY_ATTR_SCTP:
		if (flow_key->ip.proto != IPPROTO_SCTP)
			return -EINVAL;

		return validate_tp_port(flow_key);

	default:
		return -EINVAL;
	}

	return 0;
}

static int validate_userspace(const struct nlattr *attr)
{
	static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
		[OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
		[OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC },
	};
	struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
	int error;

	error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
				 attr, userspace_policy);
	if (error)
		return error;

	if (!a[OVS_USERSPACE_ATTR_PID] ||
	    !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
		return -EINVAL;

	return 0;
}

static int copy_action(const struct nlattr *from,
		       struct sw_flow_actions **sfa)
{
	int totlen = NLA_ALIGN(from->nla_len);
	struct nlattr *to;

	to = reserve_sfa_size(sfa, from->nla_len);
	if (IS_ERR(to))
		return PTR_ERR(to);

	memcpy(to, from, totlen);
	return 0;
}

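/* Validate the nested list of actions in 'attr' against 'key' and copy them
 * into *sfa, recursing into sample actions up to SAMPLE_ACTION_DEPTH levels.
 */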
int ovs_nla_copy_actions(const struct nlattr *attr,
			 const struct sw_flow_key *key,
			 int depth,
			 struct sw_flow_actions **sfa)
{
	const struct nlattr *a;
	int rem, err;

	if (depth >= SAMPLE_ACTION_DEPTH)
		return -EOVERFLOW;

	nla_for_each_nested(a, attr, rem) {
		/* Expected argument lengths, (u32)-1 for variable length. */
		static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
			[OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
			[OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
			[OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
			[OVS_ACTION_ATTR_POP_VLAN] = 0,
			[OVS_ACTION_ATTR_SET] = (u32)-1,
			[OVS_ACTION_ATTR_SAMPLE] = (u32)-1
		};
		const struct ovs_action_push_vlan *vlan;
		int type = nla_type(a);
		bool skip_copy;

		if (type > OVS_ACTION_ATTR_MAX ||
		    (action_lens[type] != nla_len(a) &&
		     action_lens[type] != (u32)-1))
			return -EINVAL;

		skip_copy = false;
		switch (type) {
		case OVS_ACTION_ATTR_UNSPEC:
			return -EINVAL;

		case OVS_ACTION_ATTR_USERSPACE:
			err = validate_userspace(a);
			if (err)
				return err;
			break;

		case OVS_ACTION_ATTR_OUTPUT:
			if (nla_get_u32(a) >= DP_MAX_PORTS)
				return -EINVAL;
			break;


		case OVS_ACTION_ATTR_POP_VLAN:
			break;

		case OVS_ACTION_ATTR_PUSH_VLAN:
			vlan = nla_data(a);
			if (vlan->vlan_tpid != htons(ETH_P_8021Q))
				return -EINVAL;
			if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
				return -EINVAL;
			break;

		case OVS_ACTION_ATTR_SET:
			err = validate_set(a, key, sfa, &skip_copy);
			if (err)
				return err;
			break;

		case OVS_ACTION_ATTR_SAMPLE:
			err = validate_and_copy_sample(a, key, depth, sfa);
			if (err)
				return err;
			skip_copy = true;
			break;

		default:
			return -EINVAL;
		}
		if (!skip_copy) {
			err = copy_action(a, sfa);
			if (err)
				return err;
		}
	}

	if (rem > 0)
		return -EINVAL;

	return 0;
}

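/* Translate an internal (copied) sample action back into the nested Netlink
 * format expected by userspace.
 */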
static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
{
	const struct nlattr *a;
	struct nlattr *start;
	int err = 0, rem;

	start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE);
	if (!start)
		return -EMSGSIZE;

	nla_for_each_nested(a, attr, rem) {
		int type = nla_type(a);
		struct nlattr *st_sample;

		switch (type) {
		case OVS_SAMPLE_ATTR_PROBABILITY:
			if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY,
				    sizeof(u32), nla_data(a)))
				return -EMSGSIZE;
			break;
		case OVS_SAMPLE_ATTR_ACTIONS:
			st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS);
			if (!st_sample)
				return -EMSGSIZE;
			err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb);
			if (err)
				return err;
			nla_nest_end(skb, st_sample);
			break;
		}
	}

	nla_nest_end(skb, start);
	return err;
}

static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
{
	const struct nlattr *ovs_key = nla_data(a);
	int key_type = nla_type(ovs_key);
	struct nlattr *start;
	int err;

	switch (key_type) {
	case OVS_KEY_ATTR_IPV4_TUNNEL:
		start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
		if (!start)
			return -EMSGSIZE;

		err = ipv4_tun_to_nlattr(skb, nla_data(ovs_key),
					     nla_data(ovs_key));
		if (err)
			return err;
		nla_nest_end(skb, start);
		break;
	default:
		if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key))
			return -EMSGSIZE;
		break;
	}

	return 0;
}

int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
{
	const struct nlattr *a;
	int rem, err;

	nla_for_each_attr(a, attr, len, rem) {
		int type = nla_type(a);

		switch (type) {
		case OVS_ACTION_ATTR_SET:
			err = set_action_to_attr(a, skb);
			if (err)
				return err;
			break;

		case OVS_ACTION_ATTR_SAMPLE:
			err = sample_action_to_attr(a, skb);
			if (err)
				return err;
			break;
		default:
			if (nla_put(skb, type, nla_len(a), nla_data(a)))
				return -EMSGSIZE;
			break;
		}
	}

	return 0;
}
l opt">, 0))) goto bp_err; /* * Validate the answer. Because there is no way to guarantee that * the entire log is made up of log records which are the same size, * we scan over the defined maximum blocks. At this point, the maximum * is not chosen to mean anything special. XXXmiken */ num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); ASSERT(num_scan_bblks <= INT_MAX); if (last_blk < num_scan_bblks) num_scan_bblks = last_blk; start_blk = last_blk - num_scan_bblks; /* * We search for any instances of cycle number 0 that occur before * our current estimate of the head. What we're trying to detect is * 1 ... | 0 | 1 | 0... * ^ binary search ends here */ if ((error = xlog_find_verify_cycle(log, start_blk, (int)num_scan_bblks, 0, &new_blk))) goto bp_err; if (new_blk != -1) last_blk = new_blk; /* * Potentially backup over partial log record write. We don't need * to search the end of the log because we know it is zero. */ if ((error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0)) == -1) { error = XFS_ERROR(EIO); goto bp_err; } else if (error) goto bp_err; *blk_no = last_blk; bp_err: xlog_put_bp(bp); if (error) return error; return -1; } /* * These are simple subroutines used by xlog_clear_stale_blocks() below * to initialize a buffer full of empty log record headers and write * them into the log. */ STATIC void xlog_add_record( struct xlog *log, xfs_caddr_t buf, int cycle, int block, int tail_cycle, int tail_block) { xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; memset(buf, 0, BBSIZE); recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); recp->h_cycle = cpu_to_be32(cycle); recp->h_version = cpu_to_be32( xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); recp->h_fmt = cpu_to_be32(XLOG_FMT); memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); } STATIC int xlog_write_log_records( struct xlog *log, int cycle, int start_block, int blocks, int tail_cycle, int tail_block) { xfs_caddr_t offset; xfs_buf_t *bp; int balign, ealign; int sectbb = log->l_sectBBsize; int end_block = start_block + blocks; int bufblks; int error = 0; int i, j = 0; /* * Greedily allocate a buffer big enough to handle the full * range of basic blocks to be written. If that fails, try * a smaller size. We need to be able to write at least a * log sector, or we're out of luck. */ bufblks = 1 << ffs(blocks); while (bufblks > log->l_logBBsize) bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < sectbb) return ENOMEM; } /* We may need to do a read at the start to fill in part of * the buffer in the starting sector not covered by the first * write below. */ balign = round_down(start_block, sectbb); if (balign != start_block) { error = xlog_bread_noalign(log, start_block, 1, bp); if (error) goto out_put_bp; j = start_block - balign; } for (i = start_block; i < end_block; i += bufblks) { int bcount, endcount; bcount = min(bufblks, end_block - start_block); endcount = bcount - j; /* We may need to do a read at the end to fill in part of * the buffer in the final sector not covered by the write. * If this is the same sector as the above read, skip it. 
*/ ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { offset = bp->b_addr + BBTOB(ealign - start_block); error = xlog_bread_offset(log, ealign, sectbb, bp, offset); if (error) break; } offset = xlog_align(log, start_block, endcount, bp); for (; j < endcount; j++) { xlog_add_record(log, offset, cycle, i+j, tail_cycle, tail_block); offset += BBSIZE; } error = xlog_bwrite(log, start_block, endcount, bp); if (error) break; start_block += endcount; j = 0; } out_put_bp: xlog_put_bp(bp); return error; } /* * This routine is called to blow away any incomplete log writes out * in front of the log head. We do this so that we won't become confused * if we come up, write only a little bit more, and then crash again. * If we leave the partial log records out there, this situation could * cause us to think those partial writes are valid blocks since they * have the current cycle number. We get rid of them by overwriting them * with empty log records with the old cycle number rather than the * current one. * * The tail lsn is passed in rather than taken from * the log so that we will not write over the unmount record after a * clean unmount in a 512 block log. Doing so would leave the log without * any valid log records in it until a new one was written. If we crashed * during that time we would not be able to recover. */ STATIC int xlog_clear_stale_blocks( struct xlog *log, xfs_lsn_t tail_lsn) { int tail_cycle, head_cycle; int tail_block, head_block; int tail_distance, max_distance; int distance; int error; tail_cycle = CYCLE_LSN(tail_lsn); tail_block = BLOCK_LSN(tail_lsn); head_cycle = log->l_curr_cycle; head_block = log->l_curr_block; /* * Figure out the distance between the new head of the log * and the tail. We want to write over any blocks beyond the * head that we may have written just before the crash, but * we don't want to overwrite the tail of the log. */ if (head_cycle == tail_cycle) { /* * The tail is behind the head in the physical log, * so the distance from the head to the tail is the * distance from the head to the end of the log plus * the distance from the beginning of the log to the * tail. */ if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", XFS_ERRLEVEL_LOW, log->l_mp); return XFS_ERROR(EFSCORRUPTED); } tail_distance = tail_block + (log->l_logBBsize - head_block); } else { /* * The head is behind the tail in the physical log, * so the distance from the head to the tail is just * the tail block minus the head block. */ if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", XFS_ERRLEVEL_LOW, log->l_mp); return XFS_ERROR(EFSCORRUPTED); } tail_distance = tail_block - head_block; } /* * If the head is right up against the tail, we can't clear * anything. */ if (tail_distance <= 0) { ASSERT(tail_distance == 0); return 0; } max_distance = XLOG_TOTAL_REC_SHIFT(log); /* * Take the smaller of the maximum amount of outstanding I/O * we could have and the distance to the tail to clear out. * We take the smaller so that we don't overwrite the tail and * we don't waste all day writing from the head to the tail * for no reason. */ max_distance = MIN(max_distance, tail_distance); if ((head_block + max_distance) <= log->l_logBBsize) { /* * We can stomp all the blocks we need to without * wrapping around the end of the log. Just do it * in a single write. 
Use the cycle number of the * current cycle minus one so that the log will look like: * n ... | n - 1 ... */ error = xlog_write_log_records(log, (head_cycle - 1), head_block, max_distance, tail_cycle, tail_block); if (error) return error; } else { /* * We need to wrap around the end of the physical log in * order to clear all the blocks. Do it in two separate * I/Os. The first write should be from the head to the * end of the physical log, and it should use the current * cycle number minus one just like above. */ distance = log->l_logBBsize - head_block; error = xlog_write_log_records(log, (head_cycle - 1), head_block, distance, tail_cycle, tail_block); if (error) return error; /* * Now write the blocks at the start of the physical log. * This writes the remainder of the blocks we want to clear. * It uses the current cycle number since we're now on the * same cycle as the head so that we get: * n ... n ... | n - 1 ... * ^^^^^ blocks we're writing */ distance = max_distance - (log->l_logBBsize - head_block); error = xlog_write_log_records(log, head_cycle, 0, distance, tail_cycle, tail_block); if (error) return error; } return 0; } /****************************************************************************** * * Log recover routines * ****************************************************************************** */ STATIC xlog_recover_t * xlog_recover_find_tid( struct hlist_head *head, xlog_tid_t tid) { xlog_recover_t *trans; hlist_for_each_entry(trans, head, r_list) { if (trans->r_log_tid == tid) return trans; } return NULL; } STATIC void xlog_recover_new_tid( struct hlist_head *head, xlog_tid_t tid, xfs_lsn_t lsn) { xlog_recover_t *trans; trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); trans->r_log_tid = tid; trans->r_lsn = lsn; INIT_LIST_HEAD(&trans->r_itemq); INIT_HLIST_NODE(&trans->r_list); hlist_add_head(&trans->r_list, head); } STATIC void xlog_recover_add_item( struct list_head *head) { xlog_recover_item_t *item; item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); INIT_LIST_HEAD(&item->ri_list); list_add_tail(&item->ri_list, head); } STATIC int xlog_recover_add_to_cont_trans( struct xlog *log, struct xlog_recover *trans, xfs_caddr_t dp, int len) { xlog_recover_item_t *item; xfs_caddr_t ptr, old_ptr; int old_len; if (list_empty(&trans->r_itemq)) { /* finish copying rest of trans header */ xlog_recover_add_item(&trans->r_itemq); ptr = (xfs_caddr_t) &trans->r_theader + sizeof(xfs_trans_header_t) - len; memcpy(ptr, dp, len); /* d, s, l */ return 0; } /* take the tail entry */ item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); memcpy(&ptr[old_len], dp, len); /* d, s, l */ item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; trace_xfs_log_recover_item_add_cont(log, trans, item, 0); return 0; } /* * The next region to add is the start of a new region. It could be * a whole region or it could be the first part of a new region. Because * of this, the assumption here is that the type and size fields of all * format structures fit into the first 32 bits of the structure. * * This works because all regions must be 32 bit aligned. Therefore, we * either have both fields or we have neither field. In the case we have * neither field, the data part of the region is zero length. We only have * a log_op_header and can throw away the header since a new one will appear * later. 
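 * (Concretely, the log item format structures all begin with a 16 bit type
 * field followed by a 16 bit size field - e.g. ilf_type and ilf_size in
 * xfs_inode_log_format_t - which is why the code below can read the region
 * count through an xfs_inode_log_format_t pointer regardless of the actual
 * item type.)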
 * If we have at least 4 bytes, then we can determine how many regions
 * will appear in the current log item.
 */
STATIC int
xlog_recover_add_to_trans(
	struct xlog		*log,
	struct xlog_recover	*trans,
	xfs_caddr_t		dp,
	int			len)
{
	xfs_inode_log_format_t	*in_f;			/* any will do */
	xlog_recover_item_t	*item;
	xfs_caddr_t		ptr;

	if (!len)
		return 0;
	if (list_empty(&trans->r_itemq)) {
		/* we need to catch log corruptions here */
		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
			xfs_warn(log->l_mp, "%s: bad header magic number",
				__func__);
			ASSERT(0);
			return XFS_ERROR(EIO);
		}
		if (len == sizeof(xfs_trans_header_t))
			xlog_recover_add_item(&trans->r_itemq);
		memcpy(&trans->r_theader, dp, len); /* d, s, l */
		return 0;
	}

	ptr = kmem_alloc(len, KM_SLEEP);
	memcpy(ptr, dp, len);
	in_f = (xfs_inode_log_format_t *)ptr;

	/* take the tail entry */
	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
	if (item->ri_total != 0 &&
	    item->ri_total == item->ri_cnt) {
		/* tail item is in use, get a new one */
		xlog_recover_add_item(&trans->r_itemq);
		item = list_entry(trans->r_itemq.prev,
					xlog_recover_item_t, ri_list);
	}

	if (item->ri_total == 0) {		/* first region to be added */
		if (in_f->ilf_size == 0 ||
		    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
			xfs_warn(log->l_mp,
		"bad number of regions (%d) in inode log format",
				  in_f->ilf_size);
			ASSERT(0);
			return XFS_ERROR(EIO);
		}

		item->ri_total = in_f->ilf_size;
		item->ri_buf =
			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
				    KM_SLEEP);
	}
	ASSERT(item->ri_total > item->ri_cnt);
	/* Description region is ri_buf[0] */
	item->ri_buf[item->ri_cnt].i_addr = ptr;
	item->ri_buf[item->ri_cnt].i_len = len;
	item->ri_cnt++;
	trace_xfs_log_recover_item_add(log, trans, item, 0);
	return 0;
}

/*
 * Sort the log items in the transaction.
 *
 * The ordering constraints are defined by the inode allocation and unlink
 * behaviour. The rules are:
 *
 *	1. Every item is only logged once in a given transaction. Hence it
 *	   represents the last logged state of the item. Hence ordering is
 *	   dependent on the order in which operations need to be performed so
 *	   required initial conditions are always met.
 *
 *	2. Cancelled buffers are recorded in pass 1 in a separate table and
 *	   there's nothing to replay from them so we can simply cull them
 *	   from the transaction. However, we can't do that until after we've
 *	   replayed all the other items because they may be dependent on the
 *	   cancelled buffer and replaying the cancelled buffer can remove it
 *	   from the cancelled buffer table. Hence they have to be done last.
 *
 *	3. Inode allocation buffers must be replayed before inode items that
 *	   read the buffer and replay changes into it.
 *
 *	4. Inode unlink buffers must be replayed after inode items are replayed.
 *	   This ensures that inodes are completely flushed to the inode buffer
 *	   in a "free" state before we remove the unlinked inode list pointer.
 *
 * Hence the ordering needs to be inode allocation buffers first, inode items
 * second, inode unlink buffers third and cancelled buffers last.
 *
 * But there's a problem with that - we can't tell an inode allocation buffer
 * apart from a regular buffer, so we can't separate them. We can, however,
 * tell an inode unlink buffer from the others, and so we can separate them out
 * from all the other buffers and move them to last.
* * Hence, 4 lists, in order from head to tail: * - buffer_list for all buffers except cancelled/inode unlink buffers * - item_list for all non-buffer items * - inode_buffer_list for inode unlink buffers * - cancel_list for the cancelled buffers */ STATIC int xlog_recover_reorder_trans( struct xlog *log, struct xlog_recover *trans, int pass) { xlog_recover_item_t *item, *n; LIST_HEAD(sort_list); LIST_HEAD(cancel_list); LIST_HEAD(buffer_list); LIST_HEAD(inode_buffer_list); LIST_HEAD(inode_list); list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; switch (ITEM_TYPE(item)) { case XFS_LI_BUF: if (buf_f->blf_flags & XFS_BLF_CANCEL) { trace_xfs_log_recover_item_reorder_head(log, trans, item, pass); list_move(&item->ri_list, &cancel_list); break; } if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { list_move(&item->ri_list, &inode_buffer_list); break; } list_move_tail(&item->ri_list, &buffer_list); break; case XFS_LI_INODE: case XFS_LI_DQUOT: case XFS_LI_QUOTAOFF: case XFS_LI_EFD: case XFS_LI_EFI: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); list_move_tail(&item->ri_list, &inode_list); break; default: xfs_warn(log->l_mp, "%s: unrecognized type of log operation", __func__); ASSERT(0); return XFS_ERROR(EIO); } } ASSERT(list_empty(&sort_list)); if (!list_empty(&buffer_list)) list_splice(&buffer_list, &trans->r_itemq); if (!list_empty(&inode_list)) list_splice_tail(&inode_list, &trans->r_itemq); if (!list_empty(&inode_buffer_list)) list_splice_tail(&inode_buffer_list, &trans->r_itemq); if (!list_empty(&cancel_list)) list_splice_tail(&cancel_list, &trans->r_itemq); return 0; } /* * Build up the table of buf cancel records so that we don't replay * cancelled data in the second pass. For buffer records that are * not cancel records, there is nothing to do here so we just return. * * If we get a cancel record which is already in the table, this indicates * that the buffer was cancelled multiple times. In order to ensure * that during pass 2 we keep the record in the table until we reach its * last occurrence in the log, we keep a reference count in the cancel * record in the table to tell us how many times we expect to see this * record during the second pass. */ STATIC int xlog_recover_buffer_pass1( struct xlog *log, struct xlog_recover_item *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; struct list_head *bucket; struct xfs_buf_cancel *bcp; /* * If this isn't a cancel buffer item, then just return. */ if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { trace_xfs_log_recover_buf_not_cancel(log, buf_f); return 0; } /* * Insert an xfs_buf_cancel record into the hash table of them. * If there is already an identical record, bump its reference count. */ bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); list_for_each_entry(bcp, bucket, bc_list) { if (bcp->bc_blkno == buf_f->blf_blkno && bcp->bc_len == buf_f->blf_len) { bcp->bc_refcount++; trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); return 0; } } bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); bcp->bc_blkno = buf_f->blf_blkno; bcp->bc_len = buf_f->blf_len; bcp->bc_refcount = 1; list_add_tail(&bcp->bc_list, bucket); trace_xfs_log_recover_buf_cancel_add(log, buf_f); return 0; } /* * Check to see whether the buffer being recovered has a corresponding * entry in the buffer cancel record table. If it does then return 1 * so that it will be cancelled, otherwise return 0. 
If the buffer is * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement * the refcount on the entry in the table and remove it from the table * if this is the last reference. * * We remove the cancel record from the table when we encounter its * last occurrence in the log so that if the same buffer is re-used * again after its last cancellation we actually replay the changes * made at that point. */ STATIC int xlog_check_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, ushort flags) { struct list_head *bucket; struct xfs_buf_cancel *bcp; if (log->l_buf_cancel_table == NULL) { /* * There is nothing in the table built in pass one, * so this buffer must not be cancelled. */ ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } /* * Search for an entry in the cancel table that matches our buffer. */ bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); list_for_each_entry(bcp, bucket, bc_list) { if (bcp->bc_blkno == blkno && bcp->bc_len == len) goto found; } /* * We didn't find a corresponding entry in the table, so return 0 so * that the buffer is NOT cancelled. */ ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; found: /* * We've go a match, so return 1 so that the recovery of this buffer * is cancelled. If this buffer is actually a buffer cancel log * item, then decrement the refcount on the one in the table and * remove it if this is the last reference. */ if (flags & XFS_BLF_CANCEL) { if (--bcp->bc_refcount == 0) { list_del(&bcp->bc_list); kmem_free(bcp); } } return 1; } /* * Perform recovery for a buffer full of inodes. In these buffers, the only * data which should be recovered is that which corresponds to the * di_next_unlinked pointers in the on disk inode structures. The rest of the * data for the inodes is always logged through the inodes themselves rather * than the inode buffer and is recovered in xlog_recover_inode_pass2(). * * The only time when buffers full of inodes are fully recovered is when the * buffer is full of newly allocated inodes. In this case the buffer will * not be marked as an inode buffer and so will be sent to * xlog_recover_do_reg_buffer() below during recovery. */ STATIC int xlog_recover_do_inode_buffer( struct xfs_mount *mp, xlog_recover_item_t *item, struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { int i; int item_index = 0; int bit = 0; int nbits = 0; int reg_buf_offset = 0; int reg_buf_bytes = 0; int next_unlinked_offset; int inodes_per_buf; xfs_agino_t *logged_nextp; xfs_agino_t *buffer_nextp; trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); /* * Post recovery validation only works properly on CRC enabled * filesystems. */ if (xfs_sb_version_hascrc(&mp->m_sb)) bp->b_ops = &xfs_inode_buf_ops; inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + offsetof(xfs_dinode_t, di_next_unlinked); while (next_unlinked_offset >= (reg_buf_offset + reg_buf_bytes)) { /* * The next di_next_unlinked field is beyond * the current logged region. Find the next * logged region that contains or is beyond * the current di_next_unlinked field. */ bit += nbits; bit = xfs_next_bit(buf_f->blf_data_map, buf_f->blf_map_size, bit); /* * If there are no more logged regions in the * buffer, then we're done. 
*/ if (bit == -1) return 0; nbits = xfs_contig_bits(buf_f->blf_data_map, buf_f->blf_map_size, bit); ASSERT(nbits > 0); reg_buf_offset = bit << XFS_BLF_SHIFT; reg_buf_bytes = nbits << XFS_BLF_SHIFT; item_index++; } /* * If the current logged region starts after the current * di_next_unlinked field, then move on to the next * di_next_unlinked field. */ if (next_unlinked_offset < reg_buf_offset) continue; ASSERT(item->ri_buf[item_index].i_addr != NULL); ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_io_length)); /* * The current logged region contains a copy of the * current di_next_unlinked field. Extract its value * and copy it to the buffer copy. */ logged_nextp = item->ri_buf[item_index].i_addr + next_unlinked_offset - reg_buf_offset; if (unlikely(*logged_nextp == 0)) { xfs_alert(mp, "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). " "Trying to replay bad (0) inode di_next_unlinked field.", item, bp); XFS_ERROR_REPORT("xlog_recover_do_inode_buf", XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; /* * If necessary, recalculate the CRC in the on-disk inode. We * have to leave the inode in a consistent state for whoever * reads it next.... */ xfs_dinode_calc_crc(mp, (struct xfs_dinode *) xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); } return 0; } /* * Validate the recovered buffer is of the correct type and attach the * appropriate buffer operations to them for writeback. Magic numbers are in a * few places: * the first 16 bits of the buffer (inode buffer, dquot buffer), * the first 32 bits of the buffer (most blocks), * inside a struct xfs_da_blkinfo at the start of the buffer. */ static void xlog_recovery_validate_buf_type( struct xfs_mount *mp, struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { struct xfs_da_blkinfo *info = bp->b_addr; __uint32_t magic32; __uint16_t magic16; __uint16_t magicda; magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); magic16 = be16_to_cpu(*(__be16*)bp->b_addr); magicda = be16_to_cpu(info->magic); switch (xfs_blft_from_flags(buf_f)) { case XFS_BLFT_BTREE_BUF: switch (magic32) { case XFS_ABTB_CRC_MAGIC: case XFS_ABTC_CRC_MAGIC: case XFS_ABTB_MAGIC: case XFS_ABTC_MAGIC: bp->b_ops = &xfs_allocbt_buf_ops; break; case XFS_IBT_CRC_MAGIC: case XFS_IBT_MAGIC: bp->b_ops = &xfs_inobt_buf_ops; break; case XFS_BMAP_CRC_MAGIC: case XFS_BMAP_MAGIC: bp->b_ops = &xfs_bmbt_buf_ops; break; default: xfs_warn(mp, "Bad btree block magic!"); ASSERT(0); break; } break; case XFS_BLFT_AGF_BUF: if (magic32 != XFS_AGF_MAGIC) { xfs_warn(mp, "Bad AGF block magic!"); ASSERT(0); break; } bp->b_ops = &xfs_agf_buf_ops; break; case XFS_BLFT_AGFL_BUF: if (!xfs_sb_version_hascrc(&mp->m_sb)) break; if (magic32 != XFS_AGFL_MAGIC) { xfs_warn(mp, "Bad AGFL block magic!"); ASSERT(0); break; } bp->b_ops = &xfs_agfl_buf_ops; break; case XFS_BLFT_AGI_BUF: if (magic32 != XFS_AGI_MAGIC) { xfs_warn(mp, "Bad AGI block magic!"); ASSERT(0); break; } bp->b_ops = &xfs_agi_buf_ops; break; case XFS_BLFT_UDQUOT_BUF: case XFS_BLFT_PDQUOT_BUF: case XFS_BLFT_GDQUOT_BUF: #ifdef CONFIG_XFS_QUOTA if (magic16 != XFS_DQUOT_MAGIC) { xfs_warn(mp, "Bad DQUOT block magic!"); ASSERT(0); break; } bp->b_ops = &xfs_dquot_buf_ops; #else xfs_alert(mp, "Trying to recover dquots without QUOTA support built in!"); ASSERT(0); #endif break; case XFS_BLFT_DINO_BUF: /* * we get here with inode allocation buffers, not buffers that * track unlinked list changes. 
*/ if (magic16 != XFS_DINODE_MAGIC) { xfs_warn(mp, "Bad INODE block magic!"); ASSERT(0); break; } bp->b_ops = &xfs_inode_buf_ops; break; case XFS_BLFT_SYMLINK_BUF: if (magic32 != XFS_SYMLINK_MAGIC) { xfs_warn(mp, "Bad symlink block magic!"); ASSERT(0); break; } bp->b_ops = &xfs_symlink_buf_ops; break; case XFS_BLFT_DIR_BLOCK_BUF: if (magic32 != XFS_DIR2_BLOCK_MAGIC && magic32 != XFS_DIR3_BLOCK_MAGIC) { xfs_warn(mp, "Bad dir block magic!"); ASSERT(0); break; } bp->b_ops = &xfs_dir3_block_buf_ops; break; case XFS_BLFT_DIR_DATA_BUF: if (magic32 != XFS_DIR2_DATA_MAGIC && magic32 != XFS_DIR3_DATA_MAGIC) { xfs_warn(mp, "Bad dir data magic!"); ASSERT(0); break; } bp->b_ops = &xfs_dir3_data_buf_ops; break; case XFS_BLFT_DIR_FREE_BUF: if (magic32 != XFS_DIR2_FREE_MAGIC && magic32 != XFS_DIR3_FREE_MAGIC) { xfs_warn(mp, "Bad dir3 free magic!"); ASSERT(0); break; } bp->b_ops = &xfs_dir3_free_buf_ops; break; case XFS_BLFT_DIR_LEAF1_BUF: if (magicda != XFS_DIR2_LEAF1_MAGIC && magicda != XFS_DIR3_LEAF1_MAGIC) { xfs_warn(mp, "Bad dir leaf1 magic!"); ASSERT(0); break; } bp->b_ops = &xfs_dir3_leaf1_buf_ops; break; case XFS_BLFT_DIR_LEAFN_BUF: if (magicda != XFS_DIR2_LEAFN_MAGIC && magicda != XFS_DIR3_LEAFN_MAGIC) { xfs_warn(mp, "Bad dir leafn magic!"); ASSERT(0); break; } bp->b_ops = &xfs_dir3_leafn_buf_ops; break; case XFS_BLFT_DA_NODE_BUF: if (magicda != XFS_DA_NODE_MAGIC && magicda != XFS_DA3_NODE_MAGIC) { xfs_warn(mp, "Bad da node magic!"); ASSERT(0); break; } bp->b_ops = &xfs_da3_node_buf_ops; break; case XFS_BLFT_ATTR_LEAF_BUF: if (magicda != XFS_ATTR_LEAF_MAGIC && magicda != XFS_ATTR3_LEAF_MAGIC) { xfs_warn(mp, "Bad attr leaf magic!"); ASSERT(0); break; } bp->b_ops = &xfs_attr3_leaf_buf_ops; break; case XFS_BLFT_ATTR_RMT_BUF: if (!xfs_sb_version_hascrc(&mp->m_sb)) break; if (magic32 != XFS_ATTR3_RMT_MAGIC) { xfs_warn(mp, "Bad attr remote magic!"); ASSERT(0); break; } bp->b_ops = &xfs_attr3_rmt_buf_ops; break; case XFS_BLFT_SB_BUF: if (magic32 != XFS_SB_MAGIC) { xfs_warn(mp, "Bad SB block magic!"); ASSERT(0); break; } bp->b_ops = &xfs_sb_buf_ops; break; default: xfs_warn(mp, "Unknown buffer type %d!", xfs_blft_from_flags(buf_f)); break; } } /* * Perform a 'normal' buffer recovery. Each logged region of the * buffer should be copied over the corresponding region in the * given buffer. The bitmap in the buf log format structure indicates * where to place the logged data. */ STATIC void xlog_recover_do_reg_buffer( struct xfs_mount *mp, xlog_recover_item_t *item, struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { int i; int bit; int nbits; int error; trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); bit = 0; i = 1; /* 0 is the buf format structure */ while (1) { bit = xfs_next_bit(buf_f->blf_data_map, buf_f->blf_map_size, bit); if (bit == -1) break; nbits = xfs_contig_bits(buf_f->blf_data_map, buf_f->blf_map_size, bit); ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); ASSERT(BBTOB(bp->b_io_length) >= ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); /* * The dirty regions logged in the buffer, even though * contiguous, may span multiple chunks. This is because the * dirty region may span a physical page boundary in a buffer * and hence be split into two separate vectors for writing into * the log. Hence we need to trim nbits back to the length of * the current region being copied out of the log. 
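		 * (For example, a 384 byte dirty region covers three 128 byte
		 * XFS_BLF_CHUNK chunks, but may have been logged as a 256 byte
		 * vector followed by a 128 byte one; nbits is clamped here to
		 * the vector actually being copied.)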
		 */
		if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
			nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;

		/*
		 * Do a sanity check if this is a dquot buffer. Just checking
		 * the first dquot in the buffer should do. XXX This is
		 * probably a good thing to do for other buf types also.
		 */
		error = 0;
		if (buf_f->blf_flags &
		   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
			if (item->ri_buf[i].i_addr == NULL) {
				xfs_alert(mp,
					"XFS: NULL dquot in %s.", __func__);
				goto next;
			}
			if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
				xfs_alert(mp,
					"XFS: dquot too small (%d) in %s.",
					item->ri_buf[i].i_len, __func__);
				goto next;
			}
			error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
					       -1, 0, XFS_QMOPT_DOWARN,
					       "dquot_buf_recover");
			if (error)
				goto next;
		}

		memcpy(xfs_buf_offset(bp,
			(uint)bit << XFS_BLF_SHIFT),	/* dest */
			item->ri_buf[i].i_addr,		/* source */
			nbits<<XFS_BLF_SHIFT);		/* length */
 next:
		i++;
		bit += nbits;
	}

	/* Shouldn't be any more regions */
	ASSERT(i == item->ri_total);

	/*
	 * We can only do post recovery validation on items on CRC enabled
	 * filesystems as we need to know when the buffer was written to be able
	 * to determine if we should have replayed the item. If we replay old
	 * metadata over a newer buffer, then it will enter a temporarily
	 * inconsistent state resulting in verification failures. Hence for now
	 * just avoid the verification stage for non-crc filesystems
	 */
	if (xfs_sb_version_hascrc(&mp->m_sb))
		xlog_recovery_validate_buf_type(mp, bp, buf_f);
}

/*
 * Do some primitive error checking on ondisk dquot data structures.
 */
int
xfs_qm_dqcheck(
	struct xfs_mount *mp,
	xfs_disk_dquot_t *ddq,
	xfs_dqid_t	 id,
	uint		 type,	/* used only when IO_dorepair is true */
	uint		 flags,
	char		 *str)
{
	xfs_dqblk_t	 *d = (xfs_dqblk_t *)ddq;
	int		errs = 0;

	/*
	 * We can encounter an uninitialized dquot buffer for 2 reasons:
	 * 1. If we crash while deleting the quotainode(s), and those blks got
	 *    used for user data. This is because we take the path of regular
	 *    file deletion; however, the size field of quotainodes is never
	 *    updated, so all the tricks that we play in itruncate_finish
	 *    don't quite matter.
	 *
	 * 2. We don't play the quota buffers when there's a quotaoff logitem.
	 *    But the allocation will be replayed so we'll end up with an
	 *    uninitialized quota block.
	 *
	 * This is all fine; things are still consistent, and we haven't lost
	 * any quota information. Just don't complain about bad dquot blks.
*/ if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) { if (flags & XFS_QMOPT_DOWARN) xfs_alert(mp, "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); errs++; } if (ddq->d_version != XFS_DQUOT_VERSION) { if (flags & XFS_QMOPT_DOWARN) xfs_alert(mp, "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", str, id, ddq->d_version, XFS_DQUOT_VERSION); errs++; } if (ddq->d_flags != XFS_DQ_USER && ddq->d_flags != XFS_DQ_PROJ && ddq->d_flags != XFS_DQ_GROUP) { if (flags & XFS_QMOPT_DOWARN) xfs_alert(mp, "%s : XFS dquot ID 0x%x, unknown flags 0x%x", str, id, ddq->d_flags); errs++; } if (id != -1 && id != be32_to_cpu(ddq->d_id)) { if (flags & XFS_QMOPT_DOWARN) xfs_alert(mp, "%s : ondisk-dquot 0x%p, ID mismatch: " "0x%x expected, found id 0x%x", str, ddq, id, be32_to_cpu(ddq->d_id)); errs++; } if (!errs && ddq->d_id) { if (ddq->d_blk_softlimit && be64_to_cpu(ddq->d_bcount) > be64_to_cpu(ddq->d_blk_softlimit)) { if (!ddq->d_btimer) { if (flags & XFS_QMOPT_DOWARN) xfs_alert(mp, "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", str, (int)be32_to_cpu(ddq->d_id), ddq); errs++; } } if (ddq->d_ino_softlimit && be64_to_cpu(ddq->d_icount) > be64_to_cpu(ddq->d_ino_softlimit)) { if (!ddq->d_itimer) { if (flags & XFS_QMOPT_DOWARN) xfs_alert(mp, "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", str, (int)be32_to_cpu(ddq->d_id), ddq); errs++; } } if (ddq->d_rtb_softlimit && be64_to_cpu(ddq->d_rtbcount) > be64_to_cpu(ddq->d_rtb_softlimit)) { if (!ddq->d_rtbtimer) { if (flags & XFS_QMOPT_DOWARN) xfs_alert(mp, "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", str, (int)be32_to_cpu(ddq->d_id), ddq); errs++; } } } if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) return errs; if (flags & XFS_QMOPT_DOWARN) xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); /* * Typically, a repair is only requested by quotacheck. */ ASSERT(id != -1); ASSERT(flags & XFS_QMOPT_DQREPAIR); memset(d, 0, sizeof(xfs_dqblk_t)); d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_flags = type; d->dd_diskdq.d_id = cpu_to_be32(id); if (xfs_sb_version_hascrc(&mp->m_sb)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } return errs; } /* * Perform a dquot buffer recovery. * Simple algorithm: if we have found a QUOTAOFF logitem of the same type * (ie. USR or GRP), then just toss this buffer away; don't recover it. * Else, treat it as a regular buffer and do recovery. */ STATIC void xlog_recover_do_dquot_buffer( struct xfs_mount *mp, struct xlog *log, struct xlog_recover_item *item, struct xfs_buf *bp, struct xfs_buf_log_format *buf_f) { uint type; trace_xfs_log_recover_buf_dquot_buf(log, buf_f); /* * Filesystems are required to send in quota flags at mount time. */ if (mp->m_qflags == 0) { return; } type = 0; if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) type |= XFS_DQ_USER; if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) type |= XFS_DQ_PROJ; if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) type |= XFS_DQ_GROUP; /* * This type of quotas was turned off, so ignore this buffer */ if (log->l_quotaoffs_flag & type) return; xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } /* * This routine replays a modification made to a buffer at runtime. * There are actually two types of buffer, regular and inode, which * are handled differently. Inode buffers are handled differently * in that we only recover a specific set of data from them, namely * the inode di_next_unlinked fields. 
This is because all other inode * data is actually logged via inode records and any data we replay * here which overlaps that may be stale. * * When meta-data buffers are freed at run time we log a buffer item * with the XFS_BLF_CANCEL bit set to indicate that previous copies * of the buffer in the log should not be replayed at recovery time. * This is so that if the blocks covered by the buffer are reused for * file data before we crash we don't end up replaying old, freed * meta-data into a user's file. * * To handle the cancellation of buffer log items, we make two passes * over the log during recovery. During the first we build a table of * those buffers which have been cancelled, and during the second we * only replay those buffers which do not have corresponding cancel * records in the table. See xlog_recover_do_buffer_pass[1,2] above * for more details on the implementation of the table of cancel records. */ STATIC int xlog_recover_buffer_pass2( struct xlog *log, struct list_head *buffer_list, struct xlog_recover_item *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; int error; uint buf_flags; /* * In this pass we only want to recover all the buffers which have * not been cancelled and are not cancellation buffers themselves. */ if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, buf_f->blf_len, buf_f->blf_flags)) { trace_xfs_log_recover_buf_cancel(log, buf_f); return 0; } trace_xfs_log_recover_buf_recover(log, buf_f); buf_flags = 0; if (buf_f->blf_flags & XFS_BLF_INODE_BUF) buf_flags |= XBF_UNMAPPED; bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, buf_flags, NULL); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; if (error) { xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); xfs_buf_relse(bp); return error; } if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); } else if (buf_f->blf_flags & (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } if (error) return XFS_ERROR(error); /* * Perform delayed write on the buffer. Asynchronous writes will be * slower when taking into account all the buffers to be flushed. * * Also make sure that only inode buffers with good sizes stay in * the buffer cache. The kernel moves inodes in buffers of 1 block * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode * buffers in the log can be a different size if the log was generated * by an older kernel using unclustered inode buffers or a newer kernel * running with a different inode cluster size. Regardless, if the * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep * the buffer out of the buffer cache so that the buffer won't * overlap with future reads of those inodes. 
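	 * (The check below does that by marking a mis-sized inode buffer
	 * stale and writing it out synchronously, rather than queueing it
	 * for delayed write, so it does not linger in the cache.)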
*/ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp, buffer_list); } xfs_buf_relse(bp); return error; } STATIC int xlog_recover_inode_pass2( struct xlog *log, struct list_head *buffer_list, struct xlog_recover_item *item) { xfs_inode_log_format_t *in_f; xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; xfs_dinode_t *dip; int len; xfs_caddr_t src; xfs_caddr_t dest; int error; int attr_index; uint fields; xfs_icdinode_t *dicp; uint isize; int need_free = 0; if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { in_f = item->ri_buf[0].i_addr; } else { in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP); need_free = 1; error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); if (error) goto error; } /* * Inode buffers can be freed, look out for it, * and do not replay the inode. */ if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len, 0)) { error = 0; trace_xfs_log_recover_inode_cancel(log, in_f); goto error; } trace_xfs_log_recover_inode_recover(log, in_f); bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, &xfs_inode_buf_ops); if (!bp) { error = ENOMEM; goto error; } error = bp->b_error; if (error) { xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); xfs_buf_relse(bp); goto error; } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); /* * Make sure the place we're flushing out to really looks * like an inode! */ if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", __func__, dip, bp, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; goto error; } dicp = item->ri_buf[1].i_addr; if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", __func__, item, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; goto error; } /* Skip replay when the on disk inode is newer than the log one */ if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less * than smaller numbers */ if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { /* do nothing */ } else { xfs_buf_relse(bp); trace_xfs_log_recover_inode_skip(log, in_f); error = 0; goto error; } } /* Take the opportunity to reset the flush iteration count */ dicp->di_flushiter = 0; if (unlikely(S_ISREG(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad regular inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; goto error; } } else if (unlikely(S_ISDIR(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE) && (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); 
xfs_alert(mp, "%s: Bad dir inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; goto error; } } if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", __func__, item, dip, bp, in_f->ilf_ino, dicp->di_nextents + dicp->di_anextents, dicp->di_nblocks); error = EFSCORRUPTED; goto error; } if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); error = EFSCORRUPTED; goto error; } isize = xfs_icdinode_size(dicp->di_version); if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record length %d, rec ptr 0x%p", __func__, item->ri_buf[1].i_len, item); error = EFSCORRUPTED; goto error; } /* The core is in in-core format */ xfs_dinode_to_disk(dip, dicp); /* the rest is in on-disk format */ if (item->ri_buf[1].i_len > isize) { memcpy((char *)dip + isize, item->ri_buf[1].i_addr + isize, item->ri_buf[1].i_len - isize); } fields = in_f->ilf_fields; switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { case XFS_ILOG_DEV: xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); break; case XFS_ILOG_UUID: memcpy(XFS_DFORK_DPTR(dip), &in_f->ilf_u.ilfu_uuid, sizeof(uuid_t)); break; } if (in_f->ilf_size == 2) goto write_inode_buffer; len = item->ri_buf[2].i_len; src = item->ri_buf[2].i_addr; ASSERT(in_f->ilf_size <= 4); ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); ASSERT(!(fields & XFS_ILOG_DFORK) || (len == in_f->ilf_dsize)); switch (fields & XFS_ILOG_DFORK) { case XFS_ILOG_DDATA: case XFS_ILOG_DEXT: memcpy(XFS_DFORK_DPTR(dip), src, len); break; case XFS_ILOG_DBROOT: xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), XFS_DFORK_DSIZE(dip, mp)); break; default: /* * There are no data fork flags set. */ ASSERT((fields & XFS_ILOG_DFORK) == 0); break; } /* * If we logged any attribute data, recover it. There may or * may not have been any other non-core data logged in this * transaction. */ if (in_f->ilf_fields & XFS_ILOG_AFORK) { if (in_f->ilf_fields & XFS_ILOG_DFORK) { attr_index = 3; } else { attr_index = 2; } len = item->ri_buf[attr_index].i_len; src = item->ri_buf[attr_index].i_addr; ASSERT(len == in_f->ilf_asize); switch (in_f->ilf_fields & XFS_ILOG_AFORK) { case XFS_ILOG_ADATA: case XFS_ILOG_AEXT: dest = XFS_DFORK_APTR(dip); ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); memcpy(dest, src, len); break; case XFS_ILOG_ABROOT: dest = XFS_DFORK_APTR(dip); xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, (xfs_bmdr_block_t*)dest, XFS_DFORK_ASIZE(dip, mp)); break; default: xfs_warn(log->l_mp, "%s: Invalid flag", __func__); ASSERT(0); xfs_buf_relse(bp); error = EIO; goto error; } } write_inode_buffer: /* re-generate the checksum. 
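	 * (xfs_dinode_calc_crc() is a no-op for pre-v3 inodes, so it is safe
	 * to call unconditionally here.)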
*/ xfs_dinode_calc_crc(log->l_mp, dip); ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); error: if (need_free) kmem_free(in_f); return XFS_ERROR(error); } /* * Recover QUOTAOFF records. We simply make a note of it in the xlog * structure, so that we know not to do any dquot item or dquot buffer recovery, * of that type. */ STATIC int xlog_recover_quotaoff_pass1( struct xlog *log, struct xlog_recover_item *item) { xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; ASSERT(qoff_f); /* * The logitem format's flag tells us if this was user quotaoff, * group/project quotaoff or both. */ if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) log->l_quotaoffs_flag |= XFS_DQ_USER; if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) log->l_quotaoffs_flag |= XFS_DQ_PROJ; if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) log->l_quotaoffs_flag |= XFS_DQ_GROUP; return (0); } /* * Recover a dquot record */ STATIC int xlog_recover_dquot_pass2( struct xlog *log, struct list_head *buffer_list, struct xlog_recover_item *item) { xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; struct xfs_disk_dquot *ddq, *recddq; int error; xfs_dq_logformat_t *dq_f; uint type; /* * Filesystems are required to send in quota flags at mount time. */ if (mp->m_qflags == 0) return (0); recddq = item->ri_buf[1].i_addr; if (recddq == NULL) { xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); return XFS_ERROR(EIO); } if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { xfs_alert(log->l_mp, "dquot too small (%d) in %s.", item->ri_buf[1].i_len, __func__); return XFS_ERROR(EIO); } /* * This type of quotas was turned off, so ignore this record. */ type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); ASSERT(type); if (log->l_quotaoffs_flag & type) return (0); /* * At this point we know that quota was _not_ turned off. * Since the mount flags are not indicating to us otherwise, this * must mean that quota is on, and the dquot needs to be replayed. * Remember that we may not have fully recovered the superblock yet, * so we can't do the usual trick of looking at the SB quota bits. * * The other possibility, of course, is that the quota subsystem was * removed since the last mount - ENOSYS. */ dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, "xlog_recover_dquot_pass2 (log copy)"); if (error) return XFS_ERROR(EIO); ASSERT(dq_f->qlf_len == 1); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, NULL); if (error) return error; ASSERT(bp); ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); /* * At least the magic num portion should be on disk because this * was among a chunk of dquots created earlier, and we did some * minimal initialization then. */ error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, "xlog_recover_dquot_pass2"); if (error) { xfs_buf_relse(bp); return XFS_ERROR(EIO); } memcpy(ddq, recddq, item->ri_buf[1].i_len); if (xfs_sb_version_hascrc(&mp->m_sb)) { xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); return (0); } /* * This routine is called to create an in-core extent free intent * item from the efi format structure which was logged on disk. 
* It allocates an in-core efi, copies the extents from the format * structure into it, and adds the efi to the AIL with the given * LSN. */ STATIC int xlog_recover_efi_pass2( struct xlog *log, struct xlog_recover_item *item, xfs_lsn_t lsn) { int error; xfs_mount_t *mp = log->l_mp; xfs_efi_log_item_t *efip; xfs_efi_log_format_t *efi_formatp; efi_formatp = item->ri_buf[0].i_addr; efip = xfs_efi_init(mp, efi_formatp->efi_nextents); if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), &(efip->efi_format)))) { xfs_efi_item_free(efip); return error; } atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); spin_lock(&log->l_ailp->xa_lock); /* * xfs_trans_ail_update() drops the AIL lock. */ xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); return 0; } /* * This routine is called when an efd format structure is found in * a committed transaction in the log. It's purpose is to cancel * the corresponding efi if it was still in the log. To do this * it searches the AIL for the efi with an id equal to that in the * efd format structure. If we find it, we remove the efi from the * AIL and free it. */ STATIC int xlog_recover_efd_pass2( struct xlog *log, struct xlog_recover_item *item) { xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; xfs_log_item_t *lip; __uint64_t efi_id; struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; efd_formatp = item->ri_buf[0].i_addr; ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); efi_id = efd_formatp->efd_efi_id; /* * Search for the efi with the id in the efd format structure * in the AIL. */ spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { if (lip->li_type == XFS_LI_EFI) { efip = (xfs_efi_log_item_t *)lip; if (efip->efi_format.efi_id == efi_id) { /* * xfs_trans_ail_delete() drops the * AIL lock. */ xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); xfs_efi_item_free(efip); spin_lock(&ailp->xa_lock); break; } } lip = xfs_trans_ail_cursor_next(ailp, &cur); } xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); return 0; } /* * Free up any resources allocated by the transaction * * Remember that EFIs, EFDs, and IUNLINKs are handled later. */ STATIC void xlog_recover_free_trans( struct xlog_recover *trans) { xlog_recover_item_t *item, *n; int i; list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { /* Free the regions in the item. 
*/ list_del(&item->ri_list); for (i = 0; i < item->ri_cnt; i++) kmem_free(item->ri_buf[i].i_addr); /* Free the item itself */ kmem_free(item->ri_buf); kmem_free(item); } /* Free the transaction recover structure */ kmem_free(trans); } STATIC int xlog_recover_commit_pass1( struct xlog *log, struct xlog_recover *trans, struct xlog_recover_item *item) { trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); switch (ITEM_TYPE(item)) { case XFS_LI_BUF: return xlog_recover_buffer_pass1(log, item); case XFS_LI_QUOTAOFF: return xlog_recover_quotaoff_pass1(log, item); case XFS_LI_INODE: case XFS_LI_EFI: case XFS_LI_EFD: case XFS_LI_DQUOT: /* nothing to do in pass 1 */ return 0; default: xfs_warn(log->l_mp, "%s: invalid item type (%d)", __func__, ITEM_TYPE(item)); ASSERT(0); return XFS_ERROR(EIO); } } STATIC int xlog_recover_commit_pass2( struct xlog *log, struct xlog_recover *trans, struct list_head *buffer_list, struct xlog_recover_item *item) { trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); switch (ITEM_TYPE(item)) { case XFS_LI_BUF: return xlog_recover_buffer_pass2(log, buffer_list, item); case XFS_LI_INODE: return xlog_recover_inode_pass2(log, buffer_list, item); case XFS_LI_EFI: return xlog_recover_efi_pass2(log, item, trans->r_lsn); case XFS_LI_EFD: return xlog_recover_efd_pass2(log, item); case XFS_LI_DQUOT: return xlog_recover_dquot_pass2(log, buffer_list, item); case XFS_LI_QUOTAOFF: /* nothing to do in pass2 */ return 0; default: xfs_warn(log->l_mp, "%s: invalid item type (%d)", __func__, ITEM_TYPE(item)); ASSERT(0); return XFS_ERROR(EIO); } } /* * Perform the transaction. * * If the transaction modifies a buffer or inode, do it now. Otherwise, * EFIs and EFDs get queued up by adding entries into the AIL for them. */ STATIC int xlog_recover_commit_trans( struct xlog *log, struct xlog_recover *trans, int pass) { int error = 0, error2; xlog_recover_item_t *item; LIST_HEAD (buffer_list); hlist_del(&trans->r_list); error = xlog_recover_reorder_trans(log, trans, pass); if (error) return error; list_for_each_entry(item, &trans->r_itemq, ri_list) { switch (pass) { case XLOG_RECOVER_PASS1: error = xlog_recover_commit_pass1(log, trans, item); break; case XLOG_RECOVER_PASS2: error = xlog_recover_commit_pass2(log, trans, &buffer_list, item); break; default: ASSERT(0); } if (error) goto out; } xlog_recover_free_trans(trans); out: error2 = xfs_buf_delwri_submit(&buffer_list); return error ? error : error2; } STATIC int xlog_recover_unmount_trans( struct xlog *log, struct xlog_recover *trans) { /* Do nothing now */ xfs_warn(log->l_mp, "%s: Unmount LR", __func__); return 0; } /* * There are two valid states of the r_state field. 0 indicates that the * transaction structure is in a normal state. We have either seen the * start of the transaction or the last operation we added was not a partial * operation. If the last operation we added to the transaction was a * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS. * * NOTE: skip LRs with 0 data length. 
*/ STATIC int xlog_recover_process_data( struct xlog *log, struct hlist_head rhash[], struct xlog_rec_header *rhead, xfs_caddr_t dp, int pass) { xfs_caddr_t lp; int num_logops; xlog_op_header_t *ohead; xlog_recover_t *trans; xlog_tid_t tid; int error; unsigned long hash; uint flags; lp = dp + be32_to_cpu(rhead->h_len); num_logops = be32_to_cpu(rhead->h_num_logops); /* check the log format matches our own - else we can't recover */ if (xlog_header_check_recover(log->l_mp, rhead)) return (XFS_ERROR(EIO)); while ((dp < lp) && num_logops) { ASSERT(dp + sizeof(xlog_op_header_t) <= lp); ohead = (xlog_op_header_t *)dp; dp += sizeof(xlog_op_header_t); if (ohead->oh_clientid != XFS_TRANSACTION && ohead->oh_clientid != XFS_LOG) { xfs_warn(log->l_mp, "%s: bad clientid 0x%x", __func__, ohead->oh_clientid); ASSERT(0); return (XFS_ERROR(EIO)); } tid = be32_to_cpu(ohead->oh_tid); hash = XLOG_RHASH(tid); trans = xlog_recover_find_tid(&rhash[hash], tid); if (trans == NULL) { /* not found; add new tid */ if (ohead->oh_flags & XLOG_START_TRANS) xlog_recover_new_tid(&rhash[hash], tid, be64_to_cpu(rhead->h_lsn)); } else { if (dp + be32_to_cpu(ohead->oh_len) > lp) { xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, be32_to_cpu(ohead->oh_len)); WARN_ON(1); return (XFS_ERROR(EIO)); } flags = ohead->oh_flags & ~XLOG_END_TRANS; if (flags & XLOG_WAS_CONT_TRANS) flags &= ~XLOG_CONTINUE_TRANS; switch (flags) { case XLOG_COMMIT_TRANS: error = xlog_recover_commit_trans(log, trans, pass); break; case XLOG_UNMOUNT_TRANS: error = xlog_recover_unmount_trans(log, trans); break; case XLOG_WAS_CONT_TRANS: error = xlog_recover_add_to_cont_trans(log, trans, dp, be32_to_cpu(ohead->oh_len)); break; case XLOG_START_TRANS: xfs_warn(log->l_mp, "%s: bad transaction", __func__); ASSERT(0); error = XFS_ERROR(EIO); break; case 0: case XLOG_CONTINUE_TRANS: error = xlog_recover_add_to_trans(log, trans, dp, be32_to_cpu(ohead->oh_len)); break; default: xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags); ASSERT(0); error = XFS_ERROR(EIO); break; } if (error) return error; } dp += be32_to_cpu(ohead->oh_len); num_logops--; } return 0; } /* * Process an extent free intent item that was recovered from * the log. We need to free the extents that it describes. */ STATIC int xlog_recover_process_efi( xfs_mount_t *mp, xfs_efi_log_item_t *efip) { xfs_efd_log_item_t *efdp; xfs_trans_t *tp; int i; int error = 0; xfs_extent_t *extp; xfs_fsblock_t startblock_fsb; ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); /* * First check the validity of the extents described by the * EFI. If any are bad, then assume that all are bad and * just toss the EFI. */ for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &(efip->efi_format.efi_extents[i]); startblock_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp, extp->ext_start)); if ((startblock_fsb == 0) || (extp->ext_len == 0) || (startblock_fsb >= mp->m_sb.sb_dblocks) || (extp->ext_len >= mp->m_sb.sb_agblocks)) { /* * This will pull the EFI from the AIL and * free the memory associated with it. 
*/ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip, efip->efi_format.efi_nextents); return XFS_ERROR(EIO); } } tp = xfs_trans_alloc(mp, 0); error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); if (error) goto abort_error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &(efip->efi_format.efi_extents[i]); error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); if (error) goto abort_error; xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, extp->ext_len); } set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); error = xfs_trans_commit(tp, 0); return error; abort_error: xfs_trans_cancel(tp, XFS_TRANS_ABORT); return error; } /* * When this is called, all of the EFIs which did not have * corresponding EFDs should be in the AIL. What we do now * is free the extents associated with each one. * * Since we process the EFIs in normal transactions, they * will be removed at some point after the commit. This prevents * us from just walking down the list processing each one. * We'll use a flag in the EFI to skip those that we've already * processed and use the AIL iteration mechanism's generation * count to try to speed this up at least a bit. * * When we start, we know that the EFIs are the only things in * the AIL. As we process them, however, other items are added * to the AIL. Since everything added to the AIL must come after * everything already in the AIL, we stop processing as soon as * we see something other than an EFI in the AIL. */ STATIC int xlog_recover_process_efis( struct xlog *log) { xfs_log_item_t *lip; xfs_efi_log_item_t *efip; int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; ailp = log->l_ailp; spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { /* * We're done when we see something other than an EFI. * There should be no EFIs left in the AIL now. */ if (lip->li_type != XFS_LI_EFI) { #ifdef DEBUG for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) ASSERT(lip->li_type != XFS_LI_EFI); #endif break; } /* * Skip EFIs that we've already processed. */ efip = (xfs_efi_log_item_t *)lip; if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { lip = xfs_trans_ail_cursor_next(ailp, &cur); continue; } spin_unlock(&ailp->xa_lock); error = xlog_recover_process_efi(log->l_mp, efip); spin_lock(&ailp->xa_lock); if (error) goto out; lip = xfs_trans_ail_cursor_next(ailp, &cur); } out: xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); return error; } /* * This routine performs a transaction to null out a bad inode pointer * in an agi unlinked inode hash bucket. */ STATIC void xlog_recover_clear_agi_bucket( xfs_mount_t *mp, xfs_agnumber_t agno, int bucket) { xfs_trans_t *tp; xfs_agi_t *agi; xfs_buf_t *agibp; int offset; int error; tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); if (error) goto out_abort; error = xfs_read_agi(mp, tp, agno, &agibp); if (error) goto out_abort; agi = XFS_BUF_TO_AGI(agibp); agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); error = xfs_trans_commit(tp, 0); if (error) goto out_error; return; out_abort: xfs_trans_cancel(tp, XFS_TRANS_ABORT); out_error: xfs_warn(mp, "%s: failed to clear agi %d. 
Continuing.", __func__, agno); return; } STATIC xfs_agino_t xlog_recover_process_one_iunlink( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, int bucket) { struct xfs_buf *ibp; struct xfs_dinode *dip; struct xfs_inode *ip; xfs_ino_t ino; int error; ino = XFS_AGINO_TO_INO(mp, agno, agino); error = xfs_iget(mp, NULL, ino, 0, 0, &ip); if (error) goto fail; /* * Get the on disk inode to find the next inode in the bucket. */ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0); if (error) goto fail_iput; ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_mode != 0); /* setup for the next pass */ agino = be32_to_cpu(dip->di_next_unlinked); xfs_buf_relse(ibp); /* * Prevent any DMAPI event from being sent when the reference on * the inode is dropped. */ ip->i_d.di_dmevmask = 0; IRELE(ip); return agino; fail_iput: IRELE(ip); fail: /* * We can't read in the inode this bucket points to, or this inode * is messed up. Just ditch this bucket of inodes. We will lose * some inodes and space, but at least we won't hang. * * Call xlog_recover_clear_agi_bucket() to perform a transaction to * clear the inode pointer in the bucket. */ xlog_recover_clear_agi_bucket(mp, agno, bucket); return NULLAGINO; } /* * xlog_iunlink_recover * * This is called during recovery to process any inodes which * we unlinked but not freed when the system crashed. These * inodes will be on the lists in the AGI blocks. What we do * here is scan all the AGIs and fully truncate and free any * inodes found on the lists. Each inode is removed from the * lists when it has been fully truncated and is freed. The * freeing of the inode and its removal from the list must be * atomic. */ STATIC void xlog_recover_process_iunlinks( struct xlog *log) { xfs_mount_t *mp; xfs_agnumber_t agno; xfs_agi_t *agi; xfs_buf_t *agibp; xfs_agino_t agino; int bucket; int error; uint mp_dmevmask; mp = log->l_mp; /* * Prevent any DMAPI event from being sent while in this function. */ mp_dmevmask = mp->m_dmevmask; mp->m_dmevmask = 0; for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { /* * Find the agi for this ag. */ error = xfs_read_agi(mp, NULL, agno, &agibp); if (error) { /* * AGI is b0rked. Don't process it. * * We should probably mark the filesystem as corrupt * after we've recovered all the ag's we can.... */ continue; } /* * Unlock the buffer so that it can be acquired in the normal * course of the transaction to truncate and free each inode. * Because we are not racing with anyone else here for the AGI * buffer, we don't even need to hold it locked to read the * initial unlinked bucket entries out of the buffer. We keep * buffer reference though, so that it stays pinned in memory * while we need the buffer. */ agi = XFS_BUF_TO_AGI(agibp); xfs_buf_unlock(agibp); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { agino = xlog_recover_process_one_iunlink(mp, agno, agino, bucket); } } xfs_buf_rele(agibp); } mp->m_dmevmask = mp_dmevmask; } /* * Upack the log buffer data and crc check it. If the check fails, issue a * warning if and only if the CRC in the header is non-zero. This makes the * check an advisory warning, and the zero CRC check will prevent failure * warnings from being emitted when upgrading the kernel from one that does not * add CRCs by default. 
* * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log * corruption failure */ STATIC int xlog_unpack_data_crc( struct xlog_rec_header *rhead, xfs_caddr_t dp, struct xlog *log) { __le32 crc; crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); if (crc != rhead->h_crc) { if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.\n", le32_to_cpu(rhead->h_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } /* * If we've detected a log record corruption, then we can't * recover past this point. Abort recovery if we are enforcing * CRC protection by punting an error back up the stack. */ if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) return EFSCORRUPTED; } return 0; } STATIC int xlog_unpack_data( struct xlog_rec_header *rhead, xfs_caddr_t dp, struct xlog *log) { int i, j, k; int error; error = xlog_unpack_data_crc(rhead, dp, log); if (error) return error; for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; dp += BBSIZE; } if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; dp += BBSIZE; } } return 0; } STATIC int xlog_valid_rec_header( struct xlog *log, struct xlog_rec_header *rhead, xfs_daddr_t blkno) { int hlen; if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) { XFS_ERROR_REPORT("xlog_valid_rec_header(1)", XFS_ERRLEVEL_LOW, log->l_mp); return XFS_ERROR(EFSCORRUPTED); } if (unlikely( (!rhead->h_version || (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", __func__, be32_to_cpu(rhead->h_version)); return XFS_ERROR(EIO); } /* LR body must have data or it wouldn't have been written */ hlen = be32_to_cpu(rhead->h_len); if (unlikely( hlen <= 0 || hlen > INT_MAX )) { XFS_ERROR_REPORT("xlog_valid_rec_header(2)", XFS_ERRLEVEL_LOW, log->l_mp); return XFS_ERROR(EFSCORRUPTED); } if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { XFS_ERROR_REPORT("xlog_valid_rec_header(3)", XFS_ERRLEVEL_LOW, log->l_mp); return XFS_ERROR(EFSCORRUPTED); } return 0; } /* * Read the log from tail to head and process the log records found. * Handle the two cases where the tail and head are in the same cycle * and where the active portion of the log wraps around the end of * the physical log separately. The pass parameter is passed through * to the routines called to process the data and is not looked at * here. */ STATIC int xlog_do_recovery_pass( struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, int pass) { xlog_rec_header_t *rhead; xfs_daddr_t blk_no; xfs_caddr_t offset; xfs_buf_t *hbp, *dbp; int error = 0, h_size; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; struct hlist_head rhash[XLOG_RHASH_SIZE]; ASSERT(head_blk != tail_blk); /* * Read the header of the tail block and get the iclog buffer size from * h_size. Use this to tell how many sectors make up the log header. */ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { /* * When using variable length iclogs, read first sector of * iclog header and extract the header size from it. Get a * new hbp that is the correct size. 
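		 * (For example, a 64k iclog reports h_size = 65536, which is
		 * larger than XLOG_HEADER_CYCLE_SIZE (32k), so hblks becomes 2
		 * and two header blocks are read for every log record below.)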
STATIC int
xlog_unpack_data(
	struct xlog_rec_header	*rhead,
	xfs_caddr_t		dp,
	struct xlog		*log)
{
	int			i, j, k;
	int			error;

	error = xlog_unpack_data_crc(rhead, dp, log);
	if (error)
		return error;

	for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
		  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
		*(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
		dp += BBSIZE;
	}

	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
		xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
		for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
			*(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
			dp += BBSIZE;
		}
	}

	return 0;
}

STATIC int
xlog_valid_rec_header(
	struct xlog		*log,
	struct xlog_rec_header	*rhead,
	xfs_daddr_t		blkno)
{
	int			hlen;

	if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
		XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
				XFS_ERRLEVEL_LOW, log->l_mp);
		return XFS_ERROR(EFSCORRUPTED);
	}
	if (unlikely(
	    (!rhead->h_version ||
	    (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
		xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
			__func__, be32_to_cpu(rhead->h_version));
		return XFS_ERROR(EIO);
	}

	/* LR body must have data or it wouldn't have been written */
	hlen = be32_to_cpu(rhead->h_len);
	if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
		XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
				XFS_ERRLEVEL_LOW, log->l_mp);
		return XFS_ERROR(EFSCORRUPTED);
	}
	if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
		XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
				XFS_ERRLEVEL_LOW, log->l_mp);
		return XFS_ERROR(EFSCORRUPTED);
	}
	return 0;
}

/*
 * Read the log from tail to head and process the log records found.
 * Handle the two cases where the tail and head are in the same cycle
 * and where the active portion of the log wraps around the end of
 * the physical log separately.  The pass parameter is passed through
 * to the routines called to process the data and is not looked at
 * here.
 */
STATIC int
xlog_do_recovery_pass(
	struct xlog		*log,
	xfs_daddr_t		head_blk,
	xfs_daddr_t		tail_blk,
	int			pass)
{
	xlog_rec_header_t	*rhead;
	xfs_daddr_t		blk_no;
	xfs_caddr_t		offset;
	xfs_buf_t		*hbp, *dbp;
	int			error = 0, h_size;
	int			bblks, split_bblks;
	int			hblks, split_hblks, wrapped_hblks;
	struct hlist_head	rhash[XLOG_RHASH_SIZE];

	ASSERT(head_blk != tail_blk);

	/*
	 * Read the header of the tail block and get the iclog buffer size from
	 * h_size.  Use this to tell how many sectors make up the log header.
	 */
	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
		/*
		 * When using variable length iclogs, read first sector of
		 * iclog header and extract the header size from it.  Get a
		 * new hbp that is the correct size.
		 */
		hbp = xlog_get_bp(log, 1);
		if (!hbp)
			return ENOMEM;

		error = xlog_bread(log, tail_blk, 1, hbp, &offset);
		if (error)
			goto bread_err1;

		rhead = (xlog_rec_header_t *)offset;
		error = xlog_valid_rec_header(log, rhead, tail_blk);
		if (error)
			goto bread_err1;
		h_size = be32_to_cpu(rhead->h_size);
		if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
			if (h_size % XLOG_HEADER_CYCLE_SIZE)
				hblks++;
			xlog_put_bp(hbp);
			hbp = xlog_get_bp(log, hblks);
		} else {
			hblks = 1;
		}
	} else {
		ASSERT(log->l_sectBBsize == 1);
		hblks = 1;
		hbp = xlog_get_bp(log, 1);
		h_size = XLOG_BIG_RECORD_BSIZE;
	}

	if (!hbp)
		return ENOMEM;
	dbp = xlog_get_bp(log, BTOBB(h_size));
	if (!dbp) {
		xlog_put_bp(hbp);
		return ENOMEM;
	}

	memset(rhash, 0, sizeof(rhash));
	if (tail_blk <= head_blk) {
		for (blk_no = tail_blk; blk_no < head_blk; ) {
			error = xlog_bread(log, blk_no, hblks, hbp, &offset);
			if (error)
				goto bread_err2;

			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead, blk_no);
			if (error)
				goto bread_err2;

			/* blocks in data section */
			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			error = xlog_bread(log, blk_no + hblks, bblks, dbp,
					   &offset);
			if (error)
				goto bread_err2;

			error = xlog_unpack_data(rhead, offset, log);
			if (error)
				goto bread_err2;

			error = xlog_recover_process_data(log,
						rhash, rhead, offset, pass);
			if (error)
				goto bread_err2;
			blk_no += bblks + hblks;
		}
	} else {
		/*
		 * Perform recovery around the end of the physical log.
		 * When the head is not on the same cycle number as the tail,
		 * we can't do a sequential recovery as above.
		 */
		blk_no = tail_blk;
		while (blk_no < log->l_logBBsize) {
			/*
			 * Check for header wrapping around physical end-of-log
			 */
			offset = hbp->b_addr;
			split_hblks = 0;
			wrapped_hblks = 0;
			if (blk_no + hblks <= log->l_logBBsize) {
				/* Read header in one read */
				error = xlog_bread(log, blk_no, hblks, hbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This LR is split across physical log end */
				if (blk_no != log->l_logBBsize) {
					/* some data before physical log end */
					ASSERT(blk_no <= INT_MAX);
					split_hblks =
						log->l_logBBsize - (int)blk_no;
					ASSERT(split_hblks > 0);
					error = xlog_bread(log, blk_no,
							   split_hblks, hbp,
							   &offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				wrapped_hblks = hblks - split_hblks;
				error = xlog_bread_offset(log, 0,
						wrapped_hblks, hbp,
						offset + BBTOB(split_hblks));
				if (error)
					goto bread_err2;
			}
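			/*
			 * Editor's note (illustrative, hypothetical numbers):
			 * with a physical log of l_logBBsize = 1000 basic
			 * blocks, hblks = 4 and blk_no = 998, split_hblks is 2
			 * (blocks 998-999 read from the end of the log) and
			 * wrapped_hblks is 2 (blocks 0-1 read from the start).
			 * Both reads land back to back in hbp, so rhead below
			 * sees one contiguous record header.
			 */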
			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead,
						split_hblks ? blk_no : 0);
			if (error)
				goto bread_err2;

			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			blk_no += hblks;

			/* Read in data for log record */
			if (blk_no + bblks <= log->l_logBBsize) {
				error = xlog_bread(log, blk_no, bblks, dbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This log record is split across the
				 * physical end of log */
				offset = dbp->b_addr;
				split_bblks = 0;
				if (blk_no != log->l_logBBsize) {
					/* some data is before the physical
					 * end of log */
					ASSERT(!wrapped_hblks);
					ASSERT(blk_no <= INT_MAX);
					split_bblks =
						log->l_logBBsize - (int)blk_no;
					ASSERT(split_bblks > 0);
					error = xlog_bread(log, blk_no,
							split_bblks, dbp,
							&offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				error = xlog_bread_offset(log, 0,
						bblks - split_bblks, dbp,
						offset + BBTOB(split_bblks));
				if (error)
					goto bread_err2;
			}

			error = xlog_unpack_data(rhead, offset, log);
			if (error)
				goto bread_err2;

			error = xlog_recover_process_data(log, rhash,
							rhead, offset, pass);
			if (error)
				goto bread_err2;
			blk_no += bblks;
		}

		ASSERT(blk_no >= log->l_logBBsize);
		blk_no -= log->l_logBBsize;

		/* read first part of physical log */
		while (blk_no < head_blk) {
			error = xlog_bread(log, blk_no, hblks, hbp, &offset);
			if (error)
				goto bread_err2;

			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead, blk_no);
			if (error)
				goto bread_err2;

			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			error = xlog_bread(log, blk_no+hblks, bblks, dbp,
					   &offset);
			if (error)
				goto bread_err2;

			error = xlog_unpack_data(rhead, offset, log);
			if (error)
				goto bread_err2;

			error = xlog_recover_process_data(log, rhash,
							rhead, offset, pass);
			if (error)
				goto bread_err2;
			blk_no += bblks + hblks;
		}
	}

 bread_err2:
	xlog_put_bp(dbp);
 bread_err1:
	xlog_put_bp(hbp);
	return error;
}
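/*
 * Editor's note: illustrative sketch only, not part of the original file.
 * It summarises the iteration order of the pass above when the active log
 * wraps: records from tail_blk to the physical end of the log are processed
 * first, then records from block 0 up to head_blk.  handle_range() is a
 * hypothetical stand-in for the read/validate/unpack/process sequence.
 */
#if 0
static int
example_pass_order(
	struct xlog	*log,
	xfs_daddr_t	head_blk,
	xfs_daddr_t	tail_blk)
{
	int		error;

	if (tail_blk <= head_blk)
		return handle_range(log, tail_blk, head_blk);

	/* wrapped: tail to physical end, then start of the log to head */
	error = handle_range(log, tail_blk, log->l_logBBsize);
	if (error)
		return error;
	return handle_range(log, 0, head_blk);
}
#endif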
/*
 * Do the recovery of the log.  We actually do this in two phases.
 * The two passes are necessary in order to implement the function
 * of cancelling a record written into the log.  The first pass
 * determines those things which have been cancelled, and the
 * second pass replays log items normally except for those which
 * have been cancelled.  The handling of the replay and cancellations
 * takes place in the log item type specific routines.
 *
 * The table of items which have cancel records in the log is allocated
 * and freed at this level, since only here do we know when all of
 * the log recovery has been completed.
 */
STATIC int
xlog_do_log_recovery(
	struct xlog	*log,
	xfs_daddr_t	head_blk,
	xfs_daddr_t	tail_blk)
{
	int		error, i;

	ASSERT(head_blk != tail_blk);

	/*
	 * First do a pass to find all of the cancelled buf log items.
	 * Store them in the buf_cancel_table for use in the second pass.
	 */
	log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
						 sizeof(struct list_head),
						 KM_SLEEP);
	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);

	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
				      XLOG_RECOVER_PASS1);
	if (error != 0) {
		kmem_free(log->l_buf_cancel_table);
		log->l_buf_cancel_table = NULL;
		return error;
	}
	/*
	 * Then do a second pass to actually recover the items in the log.
	 * When it is complete free the table of buf cancel items.
	 */
	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
				      XLOG_RECOVER_PASS2);
#ifdef DEBUG
	if (!error) {
		int	i;

		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
			ASSERT(list_empty(&log->l_buf_cancel_table[i]));
	}
#endif	/* DEBUG */

	kmem_free(log->l_buf_cancel_table);
	log->l_buf_cancel_table = NULL;

	return error;
}
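/*
 * Editor's note: illustrative sketch only, not part of the original file.
 * The buf cancel table allocated above is an array of XLOG_BC_TABLE_SIZE
 * list heads kept alive across both passes; a lookup that keys a bucket off
 * the buffer's start block might look like this.  The modulo bucket choice
 * here is an assumption for illustration, not the file's actual hashing
 * macro.
 */
#if 0
static struct list_head *
example_buf_cancel_bucket(
	struct xlog	*log,
	xfs_daddr_t	blkno)
{
	return &log->l_buf_cancel_table[(__uint64_t)blkno %
					XLOG_BC_TABLE_SIZE];
}
#endif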
/*
 * Do the actual recovery
 */
STATIC int
xlog_do_recover(
	struct xlog	*log,
	xfs_daddr_t	head_blk,
	xfs_daddr_t	tail_blk)
{
	int		error;
	xfs_buf_t	*bp;
	xfs_sb_t	*sbp;

	/*
	 * First replay the images in the log.
	 */
	error = xlog_do_log_recovery(log, head_blk, tail_blk);
	if (error)
		return error;

	/*
	 * If IO errors happened during recovery, bail out.
	 */
	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
		return (EIO);
	}

	/*
	 * We now update the tail_lsn since much of the recovery has completed
	 * and there may be space available to use.  If there were no extent
	 * frees or iunlinks, we can free up the entire log and set the
	 * tail_lsn to be the last_sync_lsn.  This was set in xlog_find_tail
	 * to be the lsn of the last known good LR on disk.  If there are
	 * extent frees or iunlinks they will have some entries in the AIL;
	 * so we look at the AIL to determine how to set the tail_lsn.
	 */
	xlog_assign_tail_lsn(log->l_mp);

	/*
	 * Now that we've finished replaying all buffer and inode
	 * updates, re-read in the superblock and reverify it.
	 */
	bp = xfs_getsb(log->l_mp, 0);
	XFS_BUF_UNDONE(bp);
	ASSERT(!(XFS_BUF_ISWRITE(bp)));
	XFS_BUF_READ(bp);
	XFS_BUF_UNASYNC(bp);
	bp->b_ops = &xfs_sb_buf_ops;
	xfsbdstrat(log->l_mp, bp);
	error = xfs_buf_iowait(bp);
	if (error) {
		xfs_buf_ioerror_alert(bp, __func__);
		ASSERT(0);
		xfs_buf_relse(bp);
		return error;
	}

	/* Convert superblock from on-disk format */
	sbp = &log->l_mp->m_sb;
	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
	ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
	ASSERT(xfs_sb_good_version(sbp));
	xfs_buf_relse(bp);

	/* We've re-read the superblock so re-initialize per-cpu counters */
	xfs_icsb_reinit_counters(log->l_mp);

	xlog_recover_check_summary(log);

	/* Normal transactions can now occur */
	log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
	return 0;
}

/*
 * Perform recovery and re-initialize some log variables in xlog_find_tail.
 *
 * Return error or zero.
 */
int
xlog_recover(
	struct xlog	*log)
{
	xfs_daddr_t	head_blk, tail_blk;
	int		error;

	/* find the tail of the log */
	if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
		return error;

	if (tail_blk != head_blk) {
		/* There used to be a comment here:
		 *
		 * disallow recovery on read-only mounts.  note -- mount
		 * checks for ENOSPC and turns it into an intelligent
		 * error message.
		 * ...but this is no longer true.  Now, unless you specify
		 * NORECOVERY (in which case this function would never be
		 * called), we just go ahead and recover.  We do this all
		 * under the vfs layer, so we can get away with it unless
		 * the device itself is read-only, in which case we fail.
		 */
		if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
			return error;
		}

		/*
		 * Version 5 superblock log feature mask validation.  We know
		 * the log is dirty so check if there are any unknown log
		 * features in what we need to recover.  If there are unknown
		 * features (e.g. unsupported transactions), then simply
		 * reject the attempt at recovery before touching anything.
		 */
		if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
		    xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
			xfs_warn(log->l_mp,
"Superblock has unknown incompatible log features (0x%x) enabled.\n"
"The log can not be fully and/or safely recovered by this kernel.\n"
"Please recover the log on a kernel that supports the unknown features.",
				(log->l_mp->m_sb.sb_features_log_incompat &
					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
			return EINVAL;
		}

		xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
				log->l_mp->m_logname ? log->l_mp->m_logname
						     : "internal");

		error = xlog_do_recover(log, head_blk, tail_blk);
		log->l_flags |= XLOG_RECOVERY_NEEDED;
	}
	return error;
}

/*
 * In the first part of recovery we replay inodes and buffers and build
 * up the list of extent free items which need to be processed.  Here
 * we process the extent free items and clean up the on disk unlinked
 * inode lists.  This is separated from the first part of recovery so
 * that the root and real-time bitmap inodes can be read in from disk in
 * between the two stages.  This is necessary so that we can free space
 * in the real-time portion of the file system.
 */
int
xlog_recover_finish(
	struct xlog	*log)
{
	/*
	 * Now we're ready to do the transactions needed for the
	 * rest of recovery.  Start with completing all the extent
	 * free intent records and then process the unlinked inode
	 * lists.  At this point, we essentially run in normal mode
	 * except that we're still performing recovery actions
	 * rather than accepting new requests.
	 */
	if (log->l_flags & XLOG_RECOVERY_NEEDED) {
		int	error;
		error = xlog_recover_process_efis(log);
		if (error) {
			xfs_alert(log->l_mp, "Failed to recover EFIs");
			return error;
		}
		/*
		 * Sync the log to get all the EFIs out of the AIL.
		 * This isn't absolutely necessary, but it helps in
		 * case the unlink transactions would have problems
		 * pushing the EFIs out of the way.
		 */
		xfs_log_force(log->l_mp, XFS_LOG_SYNC);

		xlog_recover_process_iunlinks(log);

		xlog_recover_check_summary(log);

		xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
				log->l_mp->m_logname ? log->l_mp->m_logname
						     : "internal");
		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
	} else {
		xfs_info(log->l_mp, "Ending clean mount");
	}
	return 0;
}

#if defined(DEBUG)
/*
 * Read all of the agf and agi counters and check that they
 * are consistent with the superblock counters.
 */
void
xlog_recover_check_summary(
	struct xlog	*log)
{
	xfs_mount_t	*mp;
	xfs_agf_t	*agfp;
	xfs_buf_t	*agfbp;
	xfs_buf_t	*agibp;
	xfs_agnumber_t	agno;
	__uint64_t	freeblks;
	__uint64_t	itotal;
	__uint64_t	ifree;
	int		error;

	mp = log->l_mp;

	freeblks = 0LL;
	itotal = 0LL;
	ifree = 0LL;
	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
		if (error) {
			xfs_alert(mp, "%s agf read failed agno %d error %d",
						__func__, agno, error);
		} else {
			agfp = XFS_BUF_TO_AGF(agfbp);
			freeblks += be32_to_cpu(agfp->agf_freeblks) +
				    be32_to_cpu(agfp->agf_flcount);
			xfs_buf_relse(agfbp);
		}

		error = xfs_read_agi(mp, NULL, agno, &agibp);
		if (error) {
			xfs_alert(mp, "%s agi read failed agno %d error %d",
						__func__, agno, error);
		} else {
			struct xfs_agi	*agi = XFS_BUF_TO_AGI(agibp);

			itotal += be32_to_cpu(agi->agi_count);
			ifree += be32_to_cpu(agi->agi_freecount);
			xfs_buf_relse(agibp);
		}
	}
}
#endif /* DEBUG */
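/*
 * Editor's note: illustrative sketch only, not part of the original file.
 * It shows the two-stage calling pattern described in the comment above
 * xlog_recover_finish(): replay the log first, read the root and real-time
 * bitmap inodes, then finish recovery.  read_root_and_rtbitmap_inodes() is
 * a hypothetical stand-in for the work the mount path performs between the
 * two stages.
 */
#if 0
static int
example_two_stage_recovery(struct xlog *log)
{
	int	error;

	error = xlog_recover(log);		/* stage 1: replay the log */
	if (error)
		return error;

	error = read_root_and_rtbitmap_inodes(log->l_mp);
	if (error)
		return error;

	return xlog_recover_finish(log);	/* stage 2: EFIs and iunlinks */
}
#endif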