/* drivers/net/ax88796.c
 *
 * Copyright 2005,2007 Simtec Electronics
 *	Ben Dooks <ben@simtec.co.uk>
 *
 * Asix AX88796 10/100 Ethernet controller support
 *	Based on ne.c, by Donald Becker, et-al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
*/

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/isapnp.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/platform_device.h>
#include <linux/delay.h>
#include <linux/timer.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/mii.h>
#include <linux/eeprom_93cx6.h>

#include <net/ax88796.h>

#include <asm/system.h>
#include <asm/io.h>

static int phy_debug = 0;

/* Rename the lib8390.c functions to show that they are in this driver */
#define __ei_open       ax_ei_open
#define __ei_close      ax_ei_close
#define __ei_poll	ax_ei_poll
#define __ei_start_xmit ax_ei_start_xmit
#define __ei_tx_timeout ax_ei_tx_timeout
#define __ei_get_stats  ax_ei_get_stats
#define __ei_set_multicast_list ax_ei_set_multicast_list
#define __ei_interrupt  ax_ei_interrupt
#define ____alloc_ei_netdev ax__alloc_ei_netdev
#define __NS8390_init   ax_NS8390_init

/* force unsigned long back to 'void __iomem *' */
#define ax_convert_addr(_a) ((void __force __iomem *)(_a))

#define ei_inb(_a)	readb(ax_convert_addr(_a))
#define ei_outb(_v, _a) writeb(_v, ax_convert_addr(_a))

#define ei_inb_p(_a)	ei_inb(_a)
#define ei_outb_p(_v, _a) ei_outb(_v, _a)

/* define EI_SHIFT() to take into account our register offsets */
#define EI_SHIFT(x)     (ei_local->reg_offset[(x)])

/* Ensure we have our RCR base value */
#define AX88796_PLATFORM

static unsigned char version[] = "ax88796.c: Copyright 2005,2007 Simtec Electronics\n";

#include "lib8390.c"

#define DRV_NAME "ax88796"
#define DRV_VERSION "1.00"

/* from ne.c */
#define NE_CMD		EI_SHIFT(0x00)
#define NE_RESET	EI_SHIFT(0x1f)
#define NE_DATAPORT	EI_SHIFT(0x10)

#define NE1SM_START_PG	0x20	/* First page of TX buffer */
#define NE1SM_STOP_PG 	0x40	/* Last page +1 of RX ring */
#define NESM_START_PG	0x40	/* First page of TX buffer */
#define NESM_STOP_PG	0x80	/* Last page +1 of RX ring */
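
/*
 * The 8390 buffer RAM is addressed in 256-byte pages: the transmit
 * buffer occupies TX_PAGES pages starting at *_START_PG and the
 * receive ring runs from there up to (but not including) *_STOP_PG.
 * ax_init_dev() selects the 8-bit or 16-bit layout from the platform
 * wordlength.
 */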

/* device private data */

struct ax_device {
	struct timer_list	 mii_timer;
	spinlock_t		 mii_lock;
	struct mii_if_info	 mii;

	u32			 msg_enable;
	void __iomem		*map2;
	struct platform_device	*dev;
	struct resource		*mem;
	struct resource		*mem2;
	struct ax_plat_data	*plat;

	unsigned char		 running;
	unsigned char		 resume_open;
	unsigned int		 irqflags;

	u32			 reg_offsets[0x20];
};

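/*
 * The ax_device is stored in the netdev private area directly after
 * lib8390's struct ei_device; ax_probe() asks ax__alloc_ei_netdev()
 * for sizeof(struct ax_device) bytes of extra space, so stepping one
 * ei_device past the private pointer finds it.
 */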
static inline struct ax_device *to_ax_dev(struct net_device *dev)
{
	struct ei_device *ei_local = netdev_priv(dev);
	return (struct ax_device *)(ei_local+1);
}

/* ax_initial_check
 *
 * do an initial probe for the card to check whether it exists
 * and is functional
 */

static int ax_initial_check(struct net_device *dev)
{
	struct ei_device *ei_local = netdev_priv(dev);
	void __iomem *ioaddr = ei_local->mem;
	int reg0;
	int regd;

	reg0 = ei_inb(ioaddr);
	if (reg0 == 0xFF)
		return -ENODEV;

	ei_outb(E8390_NODMA+E8390_PAGE1+E8390_STOP, ioaddr + E8390_CMD);
	regd = ei_inb(ioaddr + 0x0d);
	ei_outb(0xff, ioaddr + 0x0d);
	ei_outb(E8390_NODMA+E8390_PAGE0, ioaddr + E8390_CMD);
	ei_inb(ioaddr + EN0_COUNTER0); /* Clear the counter by reading. */
	if (ei_inb(ioaddr + EN0_COUNTER0) != 0) {
		ei_outb(reg0, ioaddr);
		ei_outb(regd, ioaddr + 0x0d);	/* Restore the old values. */
		return -ENODEV;
	}

	return 0;
}

/* Hard reset the card.  This used to pause for the same period that a
   8390 reset command required, but that shouldn't be necessary. */

static void ax_reset_8390(struct net_device *dev)
{
	struct ei_device *ei_local = netdev_priv(dev);
	struct ax_device  *ax = to_ax_dev(dev);
	unsigned long reset_start_time = jiffies;
	void __iomem *addr = (void __iomem *)dev->base_addr;

	if (ei_debug > 1)
		dev_dbg(&ax->dev->dev, "resetting the 8390 t=%ld\n", jiffies);

	ei_outb(ei_inb(addr + NE_RESET), addr + NE_RESET);

	ei_status.txing = 0;
	ei_status.dmaing = 0;

	/* This check _should_not_ be necessary, omit eventually. */
	while ((ei_inb(addr + EN0_ISR) & ENISR_RESET) == 0) {
		if (jiffies - reset_start_time > 2*HZ/100) {
			dev_warn(&ax->dev->dev, "%s: %s did not complete.\n",
			       __func__, dev->name);
			break;
		}
	}

	ei_outb(ENISR_RESET, addr + EN0_ISR);	/* Ack intr. */
}


static void ax_get_8390_hdr(struct net_device *dev, struct e8390_pkt_hdr *hdr,
			    int ring_page)
{
	struct ei_device *ei_local = netdev_priv(dev);
	struct ax_device  *ax = to_ax_dev(dev);
	void __iomem *nic_base = ei_local->mem;

	/* This *shouldn't* happen. If it does, it's the last thing you'll see */
	if (ei_status.dmaing) {
		dev_err(&ax->dev->dev, "%s: DMAing conflict in %s "
			"[DMAstat:%d][irqlock:%d].\n",
			dev->name, __func__,
			ei_status.dmaing, ei_status.irqlock);
		return;
	}

	ei_status.dmaing |= 0x01;
	ei_outb(E8390_NODMA+E8390_PAGE0+E8390_START, nic_base+ NE_CMD);
	ei_outb(sizeof(struct e8390_pkt_hdr), nic_base + EN0_RCNTLO);
	ei_outb(0, nic_base + EN0_RCNTHI);
	ei_outb(0, nic_base + EN0_RSARLO);		/* On page boundary */
	ei_outb(ring_page, nic_base + EN0_RSARHI);
	ei_outb(E8390_RREAD+E8390_START, nic_base + NE_CMD);

	if (ei_status.word16)
		readsw(nic_base + NE_DATAPORT, hdr, sizeof(struct e8390_pkt_hdr)>>1);
	else
		readsb(nic_base + NE_DATAPORT, hdr, sizeof(struct e8390_pkt_hdr));

	ei_outb(ENISR_RDC, nic_base + EN0_ISR);	/* Ack intr. */
	ei_status.dmaing &= ~0x01;

	le16_to_cpus(&hdr->count);
}


/* Block input and output, similar to the Crynwr packet driver.  If you
   are porting to a new ethercard, look at the packet driver source for hints.
   The NEx000 doesn't share the on-board packet memory -- you have to put
   the packet out through the "remote DMA" dataport using ei_outb. */
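/*
 * A remote-DMA transfer is programmed the same way in each direction:
 * load the byte count into EN0_RCNTLO/EN0_RCNTHI and the buffer address
 * into EN0_RSARLO/EN0_RSARHI, issue E8390_RREAD or E8390_RWRITE, and
 * then stream the data through the single NE_DATAPORT register (a word
 * at a time when the chip is in 16-bit mode).
 */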

static void ax_block_input(struct net_device *dev, int count,
			   struct sk_buff *skb, int ring_offset)
{
	struct ei_device *ei_local = netdev_priv(dev);
	struct ax_device  *ax = to_ax_dev(dev);
	void __iomem *nic_base = ei_local->mem;
	char *buf = skb->data;

	if (ei_status.dmaing) {
		dev_err(&ax->dev->dev,
			"%s: DMAing conflict in %s "
			"[DMAstat:%d][irqlock:%d].\n",
			dev->name, __func__,
			ei_status.dmaing, ei_status.irqlock);
		return;
	}

	ei_status.dmaing |= 0x01;

	ei_outb(E8390_NODMA+E8390_PAGE0+E8390_START, nic_base+ NE_CMD);
	ei_outb(count & 0xff, nic_base + EN0_RCNTLO);
	ei_outb(count >> 8, nic_base + EN0_RCNTHI);
	ei_outb(ring_offset & 0xff, nic_base + EN0_RSARLO);
	ei_outb(ring_offset >> 8, nic_base + EN0_RSARHI);
	ei_outb(E8390_RREAD+E8390_START, nic_base + NE_CMD);

	if (ei_status.word16) {
		readsw(nic_base + NE_DATAPORT, buf, count >> 1);
		if (count & 0x01)
			buf[count-1] = ei_inb(nic_base + NE_DATAPORT);

	} else {
		readsb(nic_base + NE_DATAPORT, buf, count);
	}

	ei_status.dmaing &= ~1;
}

static void ax_block_output(struct net_device *dev, int count,
			    const unsigned char *buf, const int start_page)
{
	struct ei_device *ei_local = netdev_priv(dev);
	struct ax_device  *ax = to_ax_dev(dev);
	void __iomem *nic_base = ei_local->mem;
	unsigned long dma_start;

	/* Round the count up for word writes.  Do we need to do this?
	   What effect will an odd byte count have on the 8390?
	   I should check someday. */

	if (ei_status.word16 && (count & 0x01))
		count++;

	/* This *shouldn't* happen. If it does, it's the last thing you'll see */
	if (ei_status.dmaing) {
		dev_err(&ax->dev->dev, "%s: DMAing conflict in %s."
			"[DMAstat:%d][irqlock:%d]\n",
			dev->name, __func__,
		       ei_status.dmaing, ei_status.irqlock);
		return;
	}

	ei_status.dmaing |= 0x01;
	/* We should already be in page 0, but to be safe... */
	ei_outb(E8390_PAGE0+E8390_START+E8390_NODMA, nic_base + NE_CMD);

	ei_outb(ENISR_RDC, nic_base + EN0_ISR);

	/* Now the normal output. */
	ei_outb(count & 0xff, nic_base + EN0_RCNTLO);
	ei_outb(count >> 8,   nic_base + EN0_RCNTHI);
	ei_outb(0x00, nic_base + EN0_RSARLO);
	ei_outb(start_page, nic_base + EN0_RSARHI);

	ei_outb(E8390_RWRITE+E8390_START, nic_base + NE_CMD);
	if (ei_status.word16) {
		writesw(nic_base + NE_DATAPORT, buf, count>>1);
	} else {
		writesb(nic_base + NE_DATAPORT, buf, count);
	}

	dma_start = jiffies;

	while ((ei_inb(nic_base + EN0_ISR) & ENISR_RDC) == 0) {
		if (jiffies - dma_start > 2*HZ/100) {		/* 20ms */
			dev_warn(&ax->dev->dev,
				 "%s: timeout waiting for Tx RDC.\n", dev->name);
			ax_reset_8390(dev);
			ax_NS8390_init(dev,1);
			break;
		}
	}

	ei_outb(ENISR_RDC, nic_base + EN0_ISR);	/* Ack intr. */
	ei_status.dmaing &= ~0x01;
}

/* definitions for accessing MII/EEPROM interface */

#define AX_MEMR			EI_SHIFT(0x14)
#define AX_MEMR_MDC		(1<<0)
#define AX_MEMR_MDIR		(1<<1)
#define AX_MEMR_MDI		(1<<2)
#define AX_MEMR_MDO		(1<<3)
#define AX_MEMR_EECS		(1<<4)
#define AX_MEMR_EEI		(1<<5)
#define AX_MEMR_EEO		(1<<6)
#define AX_MEMR_EECLK		(1<<7)
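
/*
 * The MII management and EEPROM interfaces are bit-banged through the
 * MEMR register: MDC is the clock, MDO drives data towards the PHY,
 * MDI is sampled for data coming back, and MDIR switches the MDIO pin
 * between output (0) and input (1).  The EE* bits drive the optional
 * 93CX6 EEPROM in the same way.
 */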

/* ax_mii_ei_outbits
 *
 * write the specified set of bits to the phy
*/

static void
ax_mii_ei_outbits(struct net_device *dev, unsigned int bits, int len)
{
	struct ei_device *ei_local = (struct ei_device *) netdev_priv(dev);
	void __iomem *memr_addr = (void __iomem *)dev->base_addr + AX_MEMR;
	unsigned int memr;

	/* clock low, data to output mode */
	memr = ei_inb(memr_addr);
	memr &= ~(AX_MEMR_MDC | AX_MEMR_MDIR);
	ei_outb(memr, memr_addr);

	for (len--; len >= 0; len--) {
		if (bits & (1 << len))
			memr |= AX_MEMR_MDO;
		else
			memr &= ~AX_MEMR_MDO;

		ei_outb(memr, memr_addr);

		/* clock high */

		ei_outb(memr | AX_MEMR_MDC, memr_addr);
		udelay(1);

		/* clock low */
		ei_outb(memr, memr_addr);
	}

	/* leaves the clock line low, mdir input */
	memr |= AX_MEMR_MDIR;
	ei_outb(memr, (void __iomem *)dev->base_addr + AX_MEMR);
}

/* ax_phy_ei_inbits
 *
 * read a specified number of bits from the phy
*/

static unsigned int
ax_phy_ei_inbits(struct net_device *dev, int no)
{
	struct ei_device *ei_local = (struct ei_device *) netdev_priv(dev);
	void __iomem *memr_addr = (void __iomem *)dev->base_addr + AX_MEMR;
	unsigned int memr;
	unsigned int result = 0;

	/* clock low, data to input mode */
	memr = ei_inb(memr_addr);
	memr &= ~AX_MEMR_MDC;
	memr |= AX_MEMR_MDIR;
	ei_outb(memr, memr_addr);

	for (no--; no >= 0; no--) {
		ei_outb(memr | AX_MEMR_MDC, memr_addr);

		udelay(1);

		if (ei_inb(memr_addr) & AX_MEMR_MDI)
			result |= (1<<no);

		ei_outb(memr, memr_addr);
	}

	return result;
}

/* ax_phy_issueaddr
 *
 * use the low level bit shifting routines to send the address
 * and command to the specified phy
*/

static void
ax_phy_issueaddr(struct net_device *dev, int phy_addr, int reg, int opc)
{
	if (phy_debug)
		pr_debug("%s: dev %p, %04x, %04x, %d\n",
			__func__, dev, phy_addr, reg, opc);

	ax_mii_ei_outbits(dev, 0x3f, 6);	/* pre-amble */
	ax_mii_ei_outbits(dev, 1, 2);		/* frame-start */
	ax_mii_ei_outbits(dev, opc, 2);		/* op code */
	ax_mii_ei_outbits(dev, phy_addr, 5);	/* phy address */
	ax_mii_ei_outbits(dev, reg, 5);		/* reg address */
}
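
/*
 * Together with ax_mii_ei_outbits() this forms an IEEE 802.3 clause-22
 * style management frame (with a shortened six-bit preamble), clocked
 * out MSB first.  A register read (opc = 2) therefore shifts out
 *
 *	111111 01 10 PPPPP RRRRR
 *
 * (preamble, start, read opcode, PHY address, register address), after
 * which ax_phy_read() turns the bus around and clocks in 17 bits,
 * keeping the low 16 as the register value.  A write (opc = 1) instead
 * continues with a '10' turnaround and the 16 data bits supplied to
 * ax_phy_write().
 */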

static int
ax_phy_read(struct net_device *dev, int phy_addr, int reg)
{
	struct ei_device *ei_local = (struct ei_device *) netdev_priv(dev);
	unsigned long flags;
 	unsigned int result;

      	spin_lock_irqsave(&ei_local->page_lock, flags);

	ax_phy_issueaddr(dev, phy_addr, reg, 2);

	result = ax_phy_ei_inbits(dev, 17);
	result &= ~(3<<16);

      	spin_unlock_irqrestore(&ei_local->page_lock, flags);

	if (phy_debug)
		pr_debug("%s: %04x.%04x => read %04x\n", __func__,
			 phy_addr, reg, result);

	return result;
}

static void
ax_phy_write(struct net_device *dev, int phy_addr, int reg, int value)
{
	struct ei_device *ei = (struct ei_device *) netdev_priv(dev);
	struct ax_device  *ax = to_ax_dev(dev);
	unsigned long flags;

	dev_dbg(&ax->dev->dev, "%s: %p, %04x, %04x %04x\n",
		__func__, dev, phy_addr, reg, value);

      	spin_lock_irqsave(&ei->page_lock, flags);

	ax_phy_issueaddr(dev, phy_addr, reg, 1);
	ax_mii_ei_outbits(dev, 2, 2);		/* send TA */
	ax_mii_ei_outbits(dev, value, 16);

      	spin_unlock_irqrestore(&ei->page_lock, flags);
}

static void ax_mii_expiry(unsigned long data)
{
	struct net_device *dev = (struct net_device *)data;
	struct ax_device  *ax = to_ax_dev(dev);
	unsigned long flags;

	spin_lock_irqsave(&ax->mii_lock, flags);
	mii_check_media(&ax->mii, netif_msg_link(ax), 0);
	spin_unlock_irqrestore(&ax->mii_lock, flags);

	if (ax->running) {
		ax->mii_timer.expires = jiffies + HZ*2;
		add_timer(&ax->mii_timer);
	}
}

static int ax_open(struct net_device *dev)
{
	struct ax_device  *ax = to_ax_dev(dev);
	struct ei_device *ei_local = netdev_priv(dev);
	int ret;

	dev_dbg(&ax->dev->dev, "%s: open\n", dev->name);

	ret = request_irq(dev->irq, ax_ei_interrupt, ax->irqflags,
			  dev->name, dev);
	if (ret)
		return ret;

	ret = ax_ei_open(dev);
	if (ret)
		return ret;

	/* turn the phy on (if turned off) */

	ei_outb(ax->plat->gpoc_val, ei_local->mem + EI_SHIFT(0x17));
	ax->running = 1;

	/* start the MII timer */

	init_timer(&ax->mii_timer);

	ax->mii_timer.expires  = jiffies+1;
	ax->mii_timer.data     = (unsigned long) dev;
	ax->mii_timer.function = ax_mii_expiry;

	add_timer(&ax->mii_timer);

	return 0;
}

static int ax_close(struct net_device *dev)
{
	struct ax_device *ax = to_ax_dev(dev);
	struct ei_device *ei_local = netdev_priv(dev);

	dev_dbg(&ax->dev->dev, "%s: close\n", dev->name);

	/* turn the phy off */

	ei_outb(ax->plat->gpoc_val | (1<<6),
	       ei_local->mem + EI_SHIFT(0x17));

	ax->running = 0;
	wmb();

	del_timer_sync(&ax->mii_timer);
	ax_ei_close(dev);

	free_irq(dev->irq, dev);
	return 0;
}

static int ax_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
{
	struct ax_device *ax = to_ax_dev(dev);
	unsigned long flags;
	int rc;

	if (!netif_running(dev))
		return -EINVAL;

	spin_lock_irqsave(&ax->mii_lock, flags);
	rc = generic_mii_ioctl(&ax->mii, if_mii(req), cmd, NULL);
	spin_unlock_irqrestore(&ax->mii_lock, flags);

	return rc;
}

/* ethtool ops */

static void ax_get_drvinfo(struct net_device *dev,
			   struct ethtool_drvinfo *info)
{
	struct ax_device *ax = to_ax_dev(dev);

	strcpy(info->driver, DRV_NAME);
	strcpy(info->version, DRV_VERSION);
	strcpy(info->bus_info, ax->dev->name);
}

static int ax_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct ax_device *ax = to_ax_dev(dev);
	unsigned long flags;

	spin_lock_irqsave(&ax->mii_lock, flags);
	mii_ethtool_gset(&ax->mii, cmd);
	spin_unlock_irqrestore(&ax->mii_lock, flags);

	return 0;
}

static int ax_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct ax_device *ax = to_ax_dev(dev);
	unsigned long flags;
	int rc;

	spin_lock_irqsave(&ax->mii_lock, flags);
	rc = mii_ethtool_sset(&ax->mii, cmd);
	spin_unlock_irqrestore(&ax->mii_lock, flags);

	return rc;
}

static int ax_nway_reset(struct net_device *dev)
{
	struct ax_device *ax = to_ax_dev(dev);
	return mii_nway_restart(&ax->mii);
}

static u32 ax_get_link(struct net_device *dev)
{
	struct ax_device *ax = to_ax_dev(dev);
	return mii_link_ok(&ax->mii);
}

static const struct ethtool_ops ax_ethtool_ops = {
	.get_drvinfo		= ax_get_drvinfo,
	.get_settings		= ax_get_settings,
	.set_settings		= ax_set_settings,
	.nway_reset		= ax_nway_reset,
	.get_link		= ax_get_link,
};

#ifdef CONFIG_AX88796_93CX6
static void ax_eeprom_register_read(struct eeprom_93cx6 *eeprom)
{
	struct ei_device *ei_local = eeprom->data;
	u8 reg = ei_inb(ei_local->mem + AX_MEMR);

	eeprom->reg_data_in = reg & AX_MEMR_EEI;
	eeprom->reg_data_out = reg & AX_MEMR_EEO; /* Input pin */
	eeprom->reg_data_clock = reg & AX_MEMR_EECLK;
	eeprom->reg_chip_select = reg & AX_MEMR_EECS;
}

static void ax_eeprom_register_write(struct eeprom_93cx6 *eeprom)
{
	struct ei_device *ei_local = eeprom->data;
	u8 reg = ei_inb(ei_local->mem + AX_MEMR);

	reg &= ~(AX_MEMR_EEI | AX_MEMR_EECLK | AX_MEMR_EECS);

	if (eeprom->reg_data_in)
		reg |= AX_MEMR_EEI;
	if (eeprom->reg_data_clock)
		reg |= AX_MEMR_EECLK;
	if (eeprom->reg_chip_select)
		reg |= AX_MEMR_EECS;

	ei_outb(reg, ei_local->mem + AX_MEMR);
	udelay(10);
}
#endif

static const struct net_device_ops ax_netdev_ops = {
	.ndo_open		= ax_open,
	.ndo_stop		= ax_close,
	.ndo_do_ioctl		= ax_ioctl,

	.ndo_start_xmit		= ax_ei_start_xmit,
	.ndo_tx_timeout		= ax_ei_tx_timeout,
	.ndo_get_stats		= ax_ei_get_stats,
	.ndo_set_multicast_list = ax_ei_set_multicast_list,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_mac_address 	= eth_mac_addr,
	.ndo_change_mtu		= eth_change_mtu,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= ax_ei_poll,
#endif
};

/* setup code */

static void ax_initial_setup(struct net_device *dev, struct ei_device *ei_local)
{
	void __iomem *ioaddr = ei_local->mem;
	struct ax_device *ax = to_ax_dev(dev);

	/* Select page 0*/
	ei_outb(E8390_NODMA+E8390_PAGE0+E8390_STOP, ioaddr + E8390_CMD);

	/* set to byte access */
	ei_outb(ax->plat->dcr_val & ~1, ioaddr + EN0_DCFG);
	ei_outb(ax->plat->gpoc_val, ioaddr + EI_SHIFT(0x17));
}

/* ax_init_dev
 *
 * initialise the specified device, taking care to note the MAC
 * address it may already have (if configured), and ensure that
 * the device is ready to be used by lib8390.c and registered with
 * the network layer.
 */

static int ax_init_dev(struct net_device *dev, int first_init)
{
	struct ei_device *ei_local = netdev_priv(dev);
	struct ax_device *ax = to_ax_dev(dev);
	void __iomem *ioaddr = ei_local->mem;
	unsigned int start_page;
	unsigned int stop_page;
	int ret;
	int i;

	ret = ax_initial_check(dev);
	if (ret)
		goto err_out;

	/* setup goes here */

	ax_initial_setup(dev, ei_local);

	/* read the mac from the card prom if we need it */

	if (first_init && ax->plat->flags & AXFLG_HAS_EEPROM) {
		unsigned char SA_prom[32];

		for(i = 0; i < sizeof(SA_prom); i+=2) {
			SA_prom[i] = ei_inb(ioaddr + NE_DATAPORT);
			SA_prom[i+1] = ei_inb(ioaddr + NE_DATAPORT);
		}

		if (ax->plat->wordlength == 2)
			for (i = 0; i < 16; i++)
				SA_prom[i] = SA_prom[i+i];

		memcpy(dev->dev_addr,  SA_prom, 6);
	}

#ifdef CONFIG_AX88796_93CX6
	if (first_init && ax->plat->flags & AXFLG_HAS_93CX6) {
		unsigned char mac_addr[6];
		struct eeprom_93cx6 eeprom;

		eeprom.data = ei_local;
		eeprom.register_read = ax_eeprom_register_read;
		eeprom.register_write = ax_eeprom_register_write;
		eeprom.width = PCI_EEPROM_WIDTH_93C56;

		eeprom_93cx6_multiread(&eeprom, 0,
				       (__le16 __force *)mac_addr,
				       sizeof(mac_addr) >> 1);

		memcpy(dev->dev_addr,  mac_addr, 6);
	}
#endif
	if (ax->plat->wordlength == 2) {
		/* We must set the 8390 for word mode. */
		ei_outb(ax->plat->dcr_val, ei_local->mem + EN0_DCFG);
		start_page = NESM_START_PG;
		stop_page = NESM_STOP_PG;
	} else {
		start_page = NE1SM_START_PG;
		stop_page = NE1SM_STOP_PG;
	}

	/* load the mac-address from the device if this is the
	 * first time we've initialised */

	if (first_init) {
		if (ax->plat->flags & AXFLG_MAC_FROMDEV) {
			ei_outb(E8390_NODMA + E8390_PAGE1 + E8390_STOP,
				ei_local->mem + E8390_CMD); /* 0x61 */
			for (i = 0; i < ETHER_ADDR_LEN; i++)
				dev->dev_addr[i] =
					ei_inb(ioaddr + EN1_PHYS_SHIFT(i));
		}

		if ((ax->plat->flags & AXFLG_MAC_FROMPLATFORM) &&
		     ax->plat->mac_addr)
			memcpy(dev->dev_addr, ax->plat->mac_addr,
				ETHER_ADDR_LEN);
	}

	ax_reset_8390(dev);

	ei_status.name = "AX88796";
	ei_status.tx_start_page = start_page;
	ei_status.stop_page = stop_page;
	ei_status.word16 = (ax->plat->wordlength == 2);
	ei_status.rx_start_page = start_page + TX_PAGES;

#ifdef PACKETBUF_MEMSIZE
	 /* Allow the packet buffer size to be overridden by know-it-alls. */
	ei_status.stop_page = ei_status.tx_start_page + PACKETBUF_MEMSIZE;
#endif

	ei_status.reset_8390	= &ax_reset_8390;
	ei_status.block_input	= &ax_block_input;
	ei_status.block_output	= &ax_block_output;
	ei_status.get_8390_hdr	= &ax_get_8390_hdr;
	ei_status.priv = 0;

	dev->netdev_ops		= &ax_netdev_ops;
	dev->ethtool_ops	= &ax_ethtool_ops;

	ax->msg_enable		= NETIF_MSG_LINK;
	ax->mii.phy_id_mask	= 0x1f;
	ax->mii.reg_num_mask	= 0x1f;
	ax->mii.phy_id		= 0x10;		/* onboard phy */
	ax->mii.force_media	= 0;
	ax->mii.full_duplex	= 0;
	ax->mii.mdio_read	= ax_phy_read;
	ax->mii.mdio_write	= ax_phy_write;
	ax->mii.dev		= dev;

	ax_NS8390_init(dev, 0);

	if (first_init)
		dev_info(&ax->dev->dev, "%dbit, irq %d, %lx, MAC: %pM\n",
			 ei_status.word16 ? 16:8, dev->irq, dev->base_addr,
			 dev->dev_addr);

	ret = register_netdev(dev);
	if (ret)
		goto out_irq;

	return 0;

 out_irq:
	/* cleanup irq */
	free_irq(dev->irq, dev);
 err_out:
	return ret;
}

static int ax_remove(struct platform_device *_dev)
{
	struct net_device *dev = platform_get_drvdata(_dev);
	struct ax_device  *ax;

	ax = to_ax_dev(dev);

	unregister_netdev(dev);
	free_irq(dev->irq, dev);

	iounmap(ei_status.mem);
	release_resource(ax->mem);
	kfree(ax->mem);

	if (ax->map2) {
		iounmap(ax->map2);
		release_resource(ax->mem2);
		kfree(ax->mem2);
	}

	free_netdev(dev);

	return 0;
}

/* ax_probe
 *
 * This is the entry point the platform device system uses to
 * notify us of a new device to attach to. Allocate memory, find
 * the resources and information passed, and map the necessary registers.
*/

static int ax_probe(struct platform_device *pdev)
{
	struct net_device *dev;
	struct ax_device  *ax;
	struct resource   *res;
	size_t size;
	int ret = 0;

	dev = ax__alloc_ei_netdev(sizeof(struct ax_device));
	if (dev == NULL)
		return -ENOMEM;

	/* ok, let's setup our device */
	ax = to_ax_dev(dev);

	memset(ax, 0, sizeof(struct ax_device));

	spin_lock_init(&ax->mii_lock);

	ax->dev = pdev;
	ax->plat = pdev->dev.platform_data;
	platform_set_drvdata(pdev, dev);

	ei_status.rxcr_base  = ax->plat->rcr_val;

	/* find the platform resources */

	res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
	if (res == NULL) {
		dev_err(&pdev->dev, "no IRQ specified\n");
		ret = -ENODEV;
		goto exit_mem;
	}

	dev->irq = res->start;
	ax->irqflags = res->flags & IRQF_TRIGGER_MASK;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	if (res == NULL) {
		dev_err(&pdev->dev, "no MEM specified\n");
		ret = -ENXIO;
		goto exit_mem;
	}

	size = (res->end - res->start) + 1;

	/* setup the register offsets from either the platform data
	 * or by using the size of the resource provided */

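	/*
	 * With no explicit offsets the registers are assumed to sit at an
	 * even stride across the window; e.g. a 0x60-byte resource gives
	 * 4 bytes per 8390 register.
	 */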
	if (ax->plat->reg_offsets)
		ei_status.reg_offset = ax->plat->reg_offsets;
	else {
		ei_status.reg_offset = ax->reg_offsets;
		for (ret = 0; ret < 0x18; ret++)
			ax->reg_offsets[ret] = (size / 0x18) * ret;
	}

	ax->mem = request_mem_region(res->start, size, pdev->name);
	if (ax->mem == NULL) {
		dev_err(&pdev->dev, "cannot reserve registers\n");
 		ret = -ENXIO;
		goto exit_mem;
	}

	ei_status.mem = ioremap(res->start, size);
	dev->base_addr = (unsigned long)ei_status.mem;

	if (ei_status.mem == NULL) {
		dev_err(&pdev->dev, "Cannot ioremap area (%08llx,%08llx)\n",
			(unsigned long long)res->start,
			(unsigned long long)res->end);

 		ret = -ENXIO;
		goto exit_req;
	}

	/* look for reset area */

	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
	if (res == NULL) {
		if (!ax->plat->reg_offsets) {
			for (ret = 0; ret < 0x20; ret++)
				ax->reg_offsets[ret] = (size / 0x20) * ret;
		}

		ax->map2 = NULL;
	} else {
 		size = (res->end - res->start) + 1;

		ax->mem2 = request_mem_region(res->start, size, pdev->name);
		if (ax->mem2 == NULL) {
			dev_err(&pdev->dev, "cannot reserve registers\n");
			ret = -ENXIO;
			goto exit_mem1;
		}

		ax->map2 = ioremap(res->start, size);
		if (ax->map2 == NULL) {
			dev_err(&pdev->dev, "cannot map reset register\n");
			ret = -ENXIO;
			goto exit_mem2;
		}

		ei_status.reg_offset[0x1f] = ax->map2 - ei_status.mem;
	}

	/* got resources, now initialise and register device */

	ret = ax_init_dev(dev, 1);
	if (!ret)
		return 0;

	if (ax->map2 == NULL)
		goto exit_mem1;

	iounmap(ax->map2);

 exit_mem2:
	release_resource(ax->mem2);
	kfree(ax->mem2);

 exit_mem1:
	iounmap(ei_status.mem);

 exit_req:
	release_resource(ax->mem);
	kfree(ax->mem);

 exit_mem:
	free_netdev(dev);

	return ret;
}

/* suspend and resume */

#ifdef CONFIG_PM
static int ax_suspend(struct platform_device *dev, pm_message_t state)
{
	struct net_device *ndev = platform_get_drvdata(dev);
	struct ax_device  *ax = to_ax_dev(ndev);

	ax->resume_open = ax->running;

	netif_device_detach(ndev);
	ax_close(ndev);

	return 0;
}

static int ax_resume(struct platform_device *pdev)
{
	struct net_device *ndev = platform_get_drvdata(pdev);
	struct ax_device  *ax = to_ax_dev(ndev);

	ax_initial_setup(ndev, netdev_priv(ndev));
	ax_NS8390_init(ndev, ax->resume_open);
	netif_device_attach(ndev);

	if (ax->resume_open)
		ax_open(ndev);

	return 0;
}

#else
#define ax_suspend NULL
#define ax_resume  NULL
#endif

static struct platform_driver axdrv = {
	.driver	= {
		.name		= "ax88796",
		.owner		= THIS_MODULE,
	},
	.probe		= ax_probe,
	.remove		= ax_remove,
	.suspend	= ax_suspend,
	.resume		= ax_resume,
};

static int __init axdrv_init(void)
{
	return platform_driver_register(&axdrv);
}

static void __exit axdrv_exit(void)
{
	platform_driver_unregister(&axdrv);
}

module_init(axdrv_init);
module_exit(axdrv_exit);

MODULE_DESCRIPTION("AX88796 10/100 Ethernet platform driver");
MODULE_AUTHOR("Ben Dooks, <ben@simtec.co.uk>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS("platform:ax88796");
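
/*
 * Usage sketch (hypothetical board support code, not part of this
 * driver): a platform registers the controller by supplying an
 * ax_plat_data together with the register window and interrupt, e.g.
 *
 *	static struct ax_plat_data mach_ax88796_data = {
 *		.flags		= AXFLG_HAS_EEPROM,
 *		.wordlength	= 2,
 *		.dcr_val	= 0x48,
 *		.rcr_val	= 0x40,
 *	};
 *
 *	static struct resource mach_ax88796_resources[] = {
 *		[0] = { .start = AX88796_BASE, .end = AX88796_BASE + 0x7f,
 *			.flags = IORESOURCE_MEM },
 *		[1] = { .start = IRQ_AX88796, .end = IRQ_AX88796,
 *			.flags = IORESOURCE_IRQ },
 *	};
 *
 *	static struct platform_device mach_ax88796 = {
 *		.name		= "ax88796",
 *		.id		= 0,
 *		.num_resources	= ARRAY_SIZE(mach_ax88796_resources),
 *		.resource	= mach_ax88796_resources,
 *		.dev		= { .platform_data = &mach_ax88796_data },
 *	};
 *
 * AX88796_BASE and IRQ_AX88796 are placeholders for the machine's own
 * addresses; the dcr_val/rcr_val shown are illustrative values only.
 */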
ent. * * Called with the lockres spinlock held. */ int (*check_downconvert)(struct ocfs2_lock_res *, int); /* * Allows a lock type to populate the lock value block. This * is called on downconvert, and when we drop a lock. * * Locks that want to use this should set LOCK_TYPE_USES_LVB * in the flags field. * * Called with the lockres spinlock held. */ void (*set_lvb)(struct ocfs2_lock_res *); /* * Called from the downconvert thread when it is determined * that a lock will be downconverted. This is called without * any locks held so the function can do work that might * schedule (syncing out data, etc). * * This should return any one of the ocfs2_unblock_action * values, depending on what it wants the thread to do. */ int (*downconvert_worker)(struct ocfs2_lock_res *, int); /* * LOCK_TYPE_* flags which describe the specific requirements * of a lock type. Descriptions of each individual flag follow. */ int flags; }; /* * Some locks want to "refresh" potentially stale data when a * meaningful (PRMODE or EXMODE) lock level is first obtained. If this * flag is set, the OCFS2_LOCK_NEEDS_REFRESH flag will be set on the * individual lockres l_flags member from the ast function. It is * expected that the locking wrapper will clear the * OCFS2_LOCK_NEEDS_REFRESH flag when done. */ #define LOCK_TYPE_REQUIRES_REFRESH 0x1 /* * Indicate that a lock type makes use of the lock value block. The * ->set_lvb lock type callback must be defined. */ #define LOCK_TYPE_USES_LVB 0x2 static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { .get_osb = ocfs2_get_inode_osb, .flags = 0, }; static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = { .get_osb = ocfs2_get_inode_osb, .check_downconvert = ocfs2_check_meta_downconvert, .set_lvb = ocfs2_set_meta_lvb, .downconvert_worker = ocfs2_data_convert_worker, .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, }; static struct ocfs2_lock_res_ops ocfs2_super_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH, }; static struct ocfs2_lock_res_ops ocfs2_rename_lops = { .flags = 0, }; static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { .get_osb = ocfs2_get_dentry_osb, .post_unlock = ocfs2_dentry_post_unlock, .downconvert_worker = ocfs2_dentry_convert_worker, .flags = 0, }; static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = { .get_osb = ocfs2_get_inode_osb, .flags = 0, }; static struct ocfs2_lock_res_ops ocfs2_flock_lops = { .get_osb = ocfs2_get_file_osb, .flags = 0, }; static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) { return lockres->l_type == OCFS2_LOCK_TYPE_META || lockres->l_type == OCFS2_LOCK_TYPE_RW || lockres->l_type == OCFS2_LOCK_TYPE_OPEN; } static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) { BUG_ON(!ocfs2_is_inode_lock(lockres)); return (struct inode *) lockres->l_priv; } static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res *lockres) { BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_DENTRY); return (struct ocfs2_dentry_lock *)lockres->l_priv; } static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) { if (lockres->l_ops->get_osb) return lockres->l_ops->get_osb(lockres); return (struct ocfs2_super *)lockres->l_priv; } static int ocfs2_lock_create(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres, int level, u32 dlm_flags); static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, int wanted); static void ocfs2_cluster_unlock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres, int level); static inline void 
ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres); static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres); static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres); static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level); static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, int convert); #define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ _err, _func, _lockres->l_name); \ } while (0) static int ocfs2_downconvert_thread(void *arg); static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); static int ocfs2_inode_lock_update(struct inode *inode, struct buffer_head **bh); static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); static inline int ocfs2_highest_compat_lock_level(int level); static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, int new_level); static int ocfs2_downconvert_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres, int new_level, int lvb, unsigned int generation); static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); static int ocfs2_cancel_convert(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); static void ocfs2_build_lock_name(enum ocfs2_lock_type type, u64 blkno, u32 generation, char *name) { int len; mlog_entry_void(); BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x", ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, (long long)blkno, generation); BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1)); mlog(0, "built lock resource with name: %s\n", name); mlog_exit_void(); } static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock); static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res, struct ocfs2_dlm_debug *dlm_debug) { mlog(0, "Add tracking for lockres %s\n", res->l_name); spin_lock(&ocfs2_dlm_tracking_lock); list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking); spin_unlock(&ocfs2_dlm_tracking_lock); } static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) { spin_lock(&ocfs2_dlm_tracking_lock); if (!list_empty(&res->l_debug_list)) list_del_init(&res->l_debug_list); spin_unlock(&ocfs2_dlm_tracking_lock); } #ifdef CONFIG_OCFS2_FS_STATS static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) { res->l_lock_num_prmode = 0; res->l_lock_num_prmode_failed = 0; res->l_lock_total_prmode = 0; res->l_lock_max_prmode = 0; res->l_lock_num_exmode = 0; res->l_lock_num_exmode_failed = 0; res->l_lock_total_exmode = 0; res->l_lock_max_exmode = 0; res->l_lock_refresh = 0; } static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, struct ocfs2_mask_waiter *mw, int ret) { unsigned long long *num, *sum; unsigned int *max, *failed; struct timespec ts = current_kernel_time(); unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start; if (level == LKM_PRMODE) { num = &res->l_lock_num_prmode; sum = &res->l_lock_total_prmode; max = &res->l_lock_max_prmode; failed = &res->l_lock_num_prmode_failed; } else if (level == LKM_EXMODE) { num = &res->l_lock_num_exmode; sum = &res->l_lock_total_exmode; max = &res->l_lock_max_exmode; failed = &res->l_lock_num_exmode_failed; } else return; (*num)++; (*sum) += time; if (time > *max) *max = time; if (ret) (*failed)++; } static inline 
void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) { lockres->l_lock_refresh++; } static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) { struct timespec ts = current_kernel_time(); mw->mw_lock_start = timespec_to_ns(&ts); } #else static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) { } static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, struct ocfs2_mask_waiter *mw, int ret) { } static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) { } static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) { } #endif static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, struct ocfs2_lock_res *res, enum ocfs2_lock_type type, struct ocfs2_lock_res_ops *ops, void *priv) { res->l_type = type; res->l_ops = ops; res->l_priv = priv; res->l_level = DLM_LOCK_IV; res->l_requested = DLM_LOCK_IV; res->l_blocking = DLM_LOCK_IV; res->l_action = OCFS2_AST_INVALID; res->l_unlock_action = OCFS2_UNLOCK_INVALID; res->l_flags = OCFS2_LOCK_INITIALIZED; ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); ocfs2_init_lock_stats(res); } void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) { /* This also clears out the lock status block */ memset(res, 0, sizeof(struct ocfs2_lock_res)); spin_lock_init(&res->l_lock); init_waitqueue_head(&res->l_event); INIT_LIST_HEAD(&res->l_blocked_list); INIT_LIST_HEAD(&res->l_mask_waiters); } void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, enum ocfs2_lock_type type, unsigned int generation, struct inode *inode) { struct ocfs2_lock_res_ops *ops; switch(type) { case OCFS2_LOCK_TYPE_RW: ops = &ocfs2_inode_rw_lops; break; case OCFS2_LOCK_TYPE_META: ops = &ocfs2_inode_inode_lops; break; case OCFS2_LOCK_TYPE_OPEN: ops = &ocfs2_inode_open_lops; break; default: mlog_bug_on_msg(1, "type: %d\n", type); ops = NULL; /* thanks, gcc */ break; }; ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno, generation, res->l_name); ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode); } static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres) { struct inode *inode = ocfs2_lock_res_inode(lockres); return OCFS2_SB(inode->i_sb); } static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres) { struct ocfs2_file_private *fp = lockres->l_priv; return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb); } static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres) { __be64 inode_blkno_be; memcpy(&inode_blkno_be, &lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], sizeof(__be64)); return be64_to_cpu(inode_blkno_be); } static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres) { struct ocfs2_dentry_lock *dl = lockres->l_priv; return OCFS2_SB(dl->dl_inode->i_sb); } void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, u64 parent, struct inode *inode) { int len; u64 inode_blkno = OCFS2_I(inode)->ip_blkno; __be64 inode_blkno_be = cpu_to_be64(inode_blkno); struct ocfs2_lock_res *lockres = &dl->dl_lockres; ocfs2_lock_res_init_once(lockres); /* * Unfortunately, the standard lock naming scheme won't work * here because we have two 16 byte values to use. Instead, * we'll stuff the inode number as a binary value. We still * want error prints to show something without garbling the * display, so drop a null byte in there before the inode * number. A future version of OCFS2 will likely use all * binary lock names. 
The stringified names have been a * tremendous aid in debugging, but now that the debugfs * interface exists, we can mangle things there if need be. * * NOTE: We also drop the standard "pad" value (the total lock * name size stays the same though - the last part is all * zeros due to the memset in ocfs2_lock_res_init_once() */ len = snprintf(lockres->l_name, OCFS2_DENTRY_LOCK_INO_START, "%c%016llx", ocfs2_lock_type_char(OCFS2_LOCK_TYPE_DENTRY), (long long)parent); BUG_ON(len != (OCFS2_DENTRY_LOCK_INO_START - 1)); memcpy(&lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], &inode_blkno_be, sizeof(__be64)); ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres, OCFS2_LOCK_TYPE_DENTRY, &ocfs2_dentry_lops, dl); } static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res, struct ocfs2_super *osb) { /* Superblock lockres doesn't come from a slab so we call init * once on it manually. */ ocfs2_lock_res_init_once(res); ocfs2_build_lock_name(OCFS2_LOCK_TYPE_SUPER, OCFS2_SUPER_BLOCK_BLKNO, 0, res->l_name); ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER, &ocfs2_super_lops, osb); } static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, struct ocfs2_super *osb) { /* Rename lockres doesn't come from a slab so we call init * once on it manually. */ ocfs2_lock_res_init_once(res); ocfs2_build_lock_name(OCFS2_LOCK_TYPE_RENAME, 0, 0, res->l_name); ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, &ocfs2_rename_lops, osb); } void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, struct ocfs2_file_private *fp) { struct inode *inode = fp->fp_file->f_mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); ocfs2_lock_res_init_once(lockres); ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno, inode->i_generation, lockres->l_name); ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres, OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops, fp); lockres->l_flags |= OCFS2_LOCK_NOCACHE; } void ocfs2_lock_res_free(struct ocfs2_lock_res *res) { mlog_entry_void(); if (!(res->l_flags & OCFS2_LOCK_INITIALIZED)) return; ocfs2_remove_lockres_tracking(res); mlog_bug_on_msg(!list_empty(&res->l_blocked_list), "Lockres %s is on the blocked list\n", res->l_name); mlog_bug_on_msg(!list_empty(&res->l_mask_waiters), "Lockres %s has mask waiters pending\n", res->l_name); mlog_bug_on_msg(spin_is_locked(&res->l_lock), "Lockres %s is locked\n", res->l_name); mlog_bug_on_msg(res->l_ro_holders, "Lockres %s has %u ro holders\n", res->l_name, res->l_ro_holders); mlog_bug_on_msg(res->l_ex_holders, "Lockres %s has %u ex holders\n", res->l_name, res->l_ex_holders); /* Need to clear out the lock status block for the dlm */ memset(&res->l_lksb, 0, sizeof(res->l_lksb)); res->l_flags = 0UL; mlog_exit_void(); } static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, int level) { mlog_entry_void(); BUG_ON(!lockres); switch(level) { case DLM_LOCK_EX: lockres->l_ex_holders++; break; case DLM_LOCK_PR: lockres->l_ro_holders++; break; default: BUG(); } mlog_exit_void(); } static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, int level) { mlog_entry_void(); BUG_ON(!lockres); switch(level) { case DLM_LOCK_EX: BUG_ON(!lockres->l_ex_holders); lockres->l_ex_holders--; break; case DLM_LOCK_PR: BUG_ON(!lockres->l_ro_holders); lockres->l_ro_holders--; break; default: BUG(); } mlog_exit_void(); } /* WARNING: This function lives in a world where the only three lock * levels are EX, PR, and NL. It *will* have to be adjusted when more * lock types are added. 
*/ static inline int ocfs2_highest_compat_lock_level(int level) { int new_level = DLM_LOCK_EX; if (level == DLM_LOCK_EX) new_level = DLM_LOCK_NL; else if (level == DLM_LOCK_PR) new_level = DLM_LOCK_PR; return new_level; } static void lockres_set_flags(struct ocfs2_lock_res *lockres, unsigned long newflags) { struct ocfs2_mask_waiter *mw, *tmp; assert_spin_locked(&lockres->l_lock); lockres->l_flags = newflags; list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) { if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) continue; list_del_init(&mw->mw_item); mw->mw_status = 0; complete(&mw->mw_complete); } } static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or) { lockres_set_flags(lockres, lockres->l_flags | or); } static void lockres_clear_flags(struct ocfs2_lock_res *lockres, unsigned long clear) { lockres_set_flags(lockres, lockres->l_flags & ~clear); } static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres) { mlog_entry_void(); BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); lockres->l_level = lockres->l_requested; if (lockres->l_level <= ocfs2_highest_compat_lock_level(lockres->l_blocking)) { lockres->l_blocking = DLM_LOCK_NL; lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); } lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); mlog_exit_void(); } static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres) { mlog_entry_void(); BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); /* Convert from RO to EX doesn't really need anything as our * information is already up to data. Convert from NL to * *anything* however should mark ourselves as needing an * update */ if (lockres->l_level == DLM_LOCK_NL && lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); lockres->l_level = lockres->l_requested; lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); mlog_exit_void(); } static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres) { mlog_entry_void(); BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); if (lockres->l_requested > DLM_LOCK_NL && !(lockres->l_flags & OCFS2_LOCK_LOCAL) && lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); lockres->l_level = lockres->l_requested; lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED); lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); mlog_exit_void(); } static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level) { int needs_downconvert = 0; mlog_entry_void(); assert_spin_locked(&lockres->l_lock); lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); if (level > lockres->l_blocking) { /* only schedule a downconvert if we haven't already scheduled * one that goes low enough to satisfy the level we're * blocking. this also catches the case where we get * duplicate BASTs */ if (ocfs2_highest_compat_lock_level(level) < ocfs2_highest_compat_lock_level(lockres->l_blocking)) needs_downconvert = 1; lockres->l_blocking = level; } mlog_exit(needs_downconvert); return needs_downconvert; } /* * OCFS2_LOCK_PENDING and l_pending_gen. * * Why does OCFS2_LOCK_PENDING exist? To close a race between setting * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock() * for more details on the race. 
* * OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces * a race on itself. In o2dlm, we can get the ast before ocfs2_dlm_lock() * returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear * OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns, * the caller is going to try to clear PENDING again. If nothing else is * happening, __lockres_clear_pending() sees PENDING is unset and does * nothing. * * But what if another path (eg downconvert thread) has just started a * new locking action? The other path has re-set PENDING. Our path * cannot clear PENDING, because that will re-open the original race * window. * * [Example] * * ocfs2_meta_lock() * ocfs2_cluster_lock() * set BUSY * set PENDING * drop l_lock * ocfs2_dlm_lock() * ocfs2_locking_ast() ocfs2_downconvert_thread() * clear PENDING ocfs2_unblock_lock() * take_l_lock * !BUSY * ocfs2_prepare_downconvert() * set BUSY * set PENDING * drop l_lock * take l_lock * clear PENDING * drop l_lock * <window> * ocfs2_dlm_lock() * * So as you can see, we now have a window where l_lock is not held, * PENDING is not set, and ocfs2_dlm_lock() has not been called. * * The core problem is that ocfs2_cluster_lock() has cleared the PENDING * set by ocfs2_prepare_downconvert(). That wasn't nice. * * To solve this we introduce l_pending_gen. A call to * lockres_clear_pending() will only do so when it is passed a generation * number that matches the lockres. lockres_set_pending() will return the * current generation number. When ocfs2_cluster_lock() goes to clear * PENDING, it passes the generation it got from set_pending(). In our * example above, the generation numbers will *not* match. Thus, * ocfs2_cluster_lock() will not clear the PENDING set by * ocfs2_prepare_downconvert(). */ /* Unlocked version for ocfs2_locking_ast() */ static void __lockres_clear_pending(struct ocfs2_lock_res *lockres, unsigned int generation, struct ocfs2_super *osb) { assert_spin_locked(&lockres->l_lock); /* * The ast and locking functions can race us here. The winner * will clear pending, the loser will not. */ if (!(lockres->l_flags & OCFS2_LOCK_PENDING) || (lockres->l_pending_gen != generation)) return; lockres_clear_flags(lockres, OCFS2_LOCK_PENDING); lockres->l_pending_gen++; /* * The downconvert thread may have skipped us because we * were PENDING. Wake it up. */ if (lockres->l_flags & OCFS2_LOCK_BLOCKED) ocfs2_wake_downconvert_thread(osb); } /* Locked version for callers of ocfs2_dlm_lock() */ static void lockres_clear_pending(struct ocfs2_lock_res *lockres, unsigned int generation, struct ocfs2_super *osb) { unsigned long flags; spin_lock_irqsave(&lockres->l_lock, flags); __lockres_clear_pending(lockres, generation, osb); spin_unlock_irqrestore(&lockres->l_lock, flags); } static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres) { assert_spin_locked(&lockres->l_lock); BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); lockres_or_flags(lockres, OCFS2_LOCK_PENDING); return lockres->l_pending_gen; } static void ocfs2_blocking_ast(void *opaque, int level) { struct ocfs2_lock_res *lockres = opaque; struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); int needs_downconvert; unsigned long flags; BUG_ON(level <= DLM_LOCK_NL); mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", lockres->l_name, level, lockres->l_level, ocfs2_lock_type_string(lockres->l_type)); /* * We can skip the bast for locks which don't enable caching - * they'll be dropped at the earliest possible time anyway. 
*/ if (lockres->l_flags & OCFS2_LOCK_NOCACHE) return; spin_lock_irqsave(&lockres->l_lock, flags); needs_downconvert = ocfs2_generic_handle_bast(lockres, level); if (needs_downconvert) ocfs2_schedule_blocked_lock(osb, lockres); spin_unlock_irqrestore(&lockres->l_lock, flags); wake_up(&lockres->l_event); ocfs2_wake_downconvert_thread(osb); } static void ocfs2_locking_ast(void *opaque) { struct ocfs2_lock_res *lockres = opaque; struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); unsigned long flags; int status; spin_lock_irqsave(&lockres->l_lock, flags); status = ocfs2_dlm_lock_status(&lockres->l_lksb); if (status == -EAGAIN) { lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); goto out; } if (status) { mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n", lockres->l_name, status); spin_unlock_irqrestore(&lockres->l_lock, flags); return; } switch(lockres->l_action) { case OCFS2_AST_ATTACH: ocfs2_generic_handle_attach_action(lockres); lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL); break; case OCFS2_AST_CONVERT: ocfs2_generic_handle_convert_action(lockres); break; case OCFS2_AST_DOWNCONVERT: ocfs2_generic_handle_downconvert_action(lockres); break; default: mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " "lockres flags = 0x%lx, unlock action: %u\n", lockres->l_name, lockres->l_action, lockres->l_flags, lockres->l_unlock_action); BUG(); } out: /* set it to something invalid so if we get called again we * can catch it. */ lockres->l_action = OCFS2_AST_INVALID; /* Did we try to cancel this lock? Clear that state */ if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; /* * We may have beaten the locking functions here. We certainly * know that dlm_lock() has been called :-) * Because we can't have two lock calls in flight at once, we * can use lockres->l_pending_gen. */ __lockres_clear_pending(lockres, lockres->l_pending_gen, osb); wake_up(&lockres->l_event); spin_unlock_irqrestore(&lockres->l_lock, flags); } static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, int convert) { unsigned long flags; mlog_entry_void(); spin_lock_irqsave(&lockres->l_lock, flags); lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); if (convert) lockres->l_action = OCFS2_AST_INVALID; else lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; spin_unlock_irqrestore(&lockres->l_lock, flags); wake_up(&lockres->l_event); mlog_exit_void(); } /* Note: If we detect another process working on the lock (i.e., * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller * to do the right thing in that case. 
*/ static int ocfs2_lock_create(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres, int level, u32 dlm_flags) { int ret = 0; unsigned long flags; unsigned int gen; mlog_entry_void(); mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level, dlm_flags); spin_lock_irqsave(&lockres->l_lock, flags); if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) || (lockres->l_flags & OCFS2_LOCK_BUSY)) { spin_unlock_irqrestore(&lockres->l_lock, flags); goto bail; } lockres->l_action = OCFS2_AST_ATTACH; lockres->l_requested = level; lockres_or_flags(lockres, OCFS2_LOCK_BUSY); gen = lockres_set_pending(lockres); spin_unlock_irqrestore(&lockres->l_lock, flags); ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, dlm_flags, lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, lockres); lockres_clear_pending(lockres, gen, osb); if (ret) { ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); ocfs2_recover_from_dlm_error(lockres, 1); } mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name); bail: mlog_exit(ret); return ret; } static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres, int flag) { unsigned long flags; int ret; spin_lock_irqsave(&lockres->l_lock, flags); ret = lockres->l_flags & flag; spin_unlock_irqrestore(&lockres->l_lock, flags); return ret; } static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres) { wait_event(lockres->l_event, !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY)); } static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres) { wait_event(lockres->l_event, !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING)); } /* predict what lock level we'll be dropping down to on behalf * of another node, and return true if the currently wanted * level will be compatible with it. */ static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, int wanted) { BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking); } static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw) { INIT_LIST_HEAD(&mw->mw_item); init_completion(&mw->mw_complete); ocfs2_init_start_time(mw); } static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) { wait_for_completion(&mw->mw_complete); /* Re-arm the completion in case we want to wait on it again */ INIT_COMPLETION(mw->mw_complete); return mw->mw_status; } static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, struct ocfs2_mask_waiter *mw, unsigned long mask, unsigned long goal) { BUG_ON(!list_empty(&mw->mw_item)); assert_spin_locked(&lockres->l_lock); list_add_tail(&mw->mw_item, &lockres->l_mask_waiters); mw->mw_mask = mask; mw->mw_goal = goal; } /* returns 0 if the mw that was removed was already satisfied, -EBUSY * if the mask still hadn't reached its goal */ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, struct ocfs2_mask_waiter *mw) { unsigned long flags; int ret = 0; spin_lock_irqsave(&lockres->l_lock, flags); if (!list_empty(&mw->mw_item)) { if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) ret = -EBUSY; list_del_init(&mw->mw_item); init_completion(&mw->mw_complete); } spin_unlock_irqrestore(&lockres->l_lock, flags); return ret; } static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw, struct ocfs2_lock_res *lockres) { int ret; ret = wait_for_completion_interruptible(&mw->mw_complete); if (ret) lockres_remove_mask_waiter(lockres, mw); else ret = mw->mw_status; /* Re-arm the completion in case we want to wait on it again */ 
INIT_COMPLETION(mw->mw_complete); return ret; } static int ocfs2_cluster_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres, int level, u32 lkm_flags, int arg_flags) { struct ocfs2_mask_waiter mw; int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ unsigned long flags; unsigned int gen; int noqueue_attempted = 0; mlog_entry_void(); ocfs2_init_mask_waiter(&mw); if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) lkm_flags |= DLM_LKF_VALBLK; again: wait = 0; if (catch_signals && signal_pending(current)) { ret = -ERESTARTSYS; goto out; } spin_lock_irqsave(&lockres->l_lock, flags); mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, "Cluster lock called on freeing lockres %s! flags " "0x%lx\n", lockres->l_name, lockres->l_flags); /* We only compare against the currently granted level * here. If the lock is blocked waiting on a downconvert, * we'll get caught below. */ if (lockres->l_flags & OCFS2_LOCK_BUSY && level > lockres->l_level) { /* is someone sitting in dlm_lock? If so, wait on * them. */ lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); wait = 1; goto unlock; } if (lockres->l_flags & OCFS2_LOCK_BLOCKED && !ocfs2_may_continue_on_blocked_lock(lockres, level)) { /* is the lock is currently blocked on behalf of * another node */ lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0); wait = 1; goto unlock; } if (level > lockres->l_level) { if (noqueue_attempted > 0) { ret = -EAGAIN; goto unlock; } if (lkm_flags & DLM_LKF_NOQUEUE) noqueue_attempted = 1; if (lockres->l_action != OCFS2_AST_INVALID) mlog(ML_ERROR, "lockres %s has action %u pending\n", lockres->l_name, lockres->l_action); if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { lockres->l_action = OCFS2_AST_ATTACH; lkm_flags &= ~DLM_LKF_CONVERT; } else { lockres->l_action = OCFS2_AST_CONVERT; lkm_flags |= DLM_LKF_CONVERT; } lockres->l_requested = level; lockres_or_flags(lockres, OCFS2_LOCK_BUSY); gen = lockres_set_pending(lockres); spin_unlock_irqrestore(&lockres->l_lock, flags); BUG_ON(level == DLM_LOCK_IV); BUG_ON(level == DLM_LOCK_NL); mlog(0, "lock %s, convert from %d to level = %d\n", lockres->l_name, lockres->l_level, level); /* call dlm_lock to upgrade lock now */ ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, lockres); lockres_clear_pending(lockres, gen, osb); if (ret) { if (!(lkm_flags & DLM_LKF_NOQUEUE) || (ret != -EAGAIN)) { ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); } ocfs2_recover_from_dlm_error(lockres, 1); goto out; } mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n", lockres->l_name); /* At this point we've gone inside the dlm and need to * complete our work regardless. */ catch_signals = 0; /* wait for busy to clear and carry on */ goto again; } /* Ok, if we get here then we're good to go. */ ocfs2_inc_holders(lockres, level); ret = 0; unlock: spin_unlock_irqrestore(&lockres->l_lock, flags); out: /* * This is helping work around a lock inversion between the page lock * and dlm locks. One path holds the page lock while calling aops * which block acquiring dlm locks. The voting thread holds dlm * locks while acquiring page locks while down converting data locks. * This block is helping an aop path notice the inversion and back * off to unlock its page lock before trying the dlm lock again. 

static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres,
				 int level)
{
	unsigned long flags;

	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);
	ocfs2_dec_holders(lockres, level);
	ocfs2_downconvert_on_unlock(osb, lockres);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	mlog_exit_void();
}

static int ocfs2_create_new_lock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres,
				 int ex,
				 int local)
{
	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
	unsigned long flags;
	u32 lkm_flags = local ? DLM_LKF_LOCAL : 0;

	spin_lock_irqsave(&lockres->l_lock, flags);
	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	return ocfs2_lock_create(osb, lockres, level, lkm_flags);
}

/* Grants us an EX lock on the data and metadata resources, skipping
 * the normal cluster directory lookup. Use this ONLY on newly created
 * inodes which other nodes can't possibly see, and which haven't been
 * hashed in the inode hash yet. This can give us a good performance
 * increase as it'll skip the network broadcast normally associated
 * with creating a new lock resource. */
int ocfs2_create_new_inode_locks(struct inode *inode)
{
	int ret;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	BUG_ON(!inode);
	BUG_ON(!ocfs2_inode_is_new(inode));

	mlog_entry_void();

	mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);

	/* Note that we don't increment any of the holder counts, nor
	 * do we add anything to a journal handle. Since this is
	 * supposed to be a new inode which the cluster doesn't know
	 * about yet, there is no need to. As far as the LVB handling
	 * is concerned, this is basically like acquiring an EX lock
	 * on a resource which has an invalid one -- we'll set it
	 * valid when we release the EX. */
	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1, 1);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

	/*
	 * We don't want to use DLM_LKF_LOCAL on a metadata lock, as those
	 * don't use a generation in their lock names.
	 */
	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

bail:
	mlog_exit(ret);
	return ret;
}

int ocfs2_rw_lock(struct inode *inode, int write)
{
	int status, level;
	struct ocfs2_lock_res *lockres;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	BUG_ON(!inode);

	mlog_entry_void();

	mlog(0, "inode %llu take %s RW lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	if (ocfs2_mount_local(osb))
		return 0;

	lockres = &OCFS2_I(inode)->ip_rw_lockres;

	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;

	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
				    0);
	if (status < 0)
		mlog_errno(status);

	mlog_exit(status);
	return status;
}

void ocfs2_rw_unlock(struct inode *inode, int write)
{
	int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry_void();

	mlog(0, "inode %llu drop %s RW lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	if (!ocfs2_mount_local(osb))
		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);

	mlog_exit_void();
}
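
/*
 * ocfs2_rw_lock()/ocfs2_rw_unlock() are meant to bracket file I/O. A
 * sketch of the usual pairing (illustrative only; the real callers live
 * in the file I/O paths, not in this file):
 *
 *	status = ocfs2_rw_lock(inode, 1);	// 1 == writer, takes EX
 *	if (status < 0)
 *		return status;
 *	// ... perform the write ...
 *	ocfs2_rw_unlock(inode, 1);		// must match the lock level
 *
 * Both calls are no-ops on a locally mounted (non-clustered) volume, so
 * callers don't need to special-case ocfs2_mount_local() themselves.
 */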

/*
 * ocfs2_open_lock() always takes a PR mode lock.
 */
int ocfs2_open_lock(struct inode *inode)
{
	int status = 0;
	struct ocfs2_lock_res *lockres;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	BUG_ON(!inode);

	mlog_entry_void();

	mlog(0, "inode %llu take PRMODE open lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);

	if (ocfs2_mount_local(osb))
		goto out;

	lockres = &OCFS2_I(inode)->ip_open_lockres;

	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
				    DLM_LOCK_PR, 0, 0);
	if (status < 0)
		mlog_errno(status);

out:
	mlog_exit(status);
	return status;
}

int ocfs2_try_open_lock(struct inode *inode, int write)
{
	int status = 0, level;
	struct ocfs2_lock_res *lockres;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	BUG_ON(!inode);

	mlog_entry_void();

	mlog(0, "inode %llu try to take %s open lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	if (ocfs2_mount_local(osb))
		goto out;

	lockres = &OCFS2_I(inode)->ip_open_lockres;

	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;

	/*
	 * The file system may already be holding a PRMODE/EXMODE open lock.
	 * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
	 * other nodes and the -EAGAIN will indicate to the caller that
	 * this inode is still in use.
	 */
	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
				    level, DLM_LKF_NOQUEUE, 0);

out:
	mlog_exit(status);
	return status;
}

/*
 * ocfs2_open_unlock() drops both PR and EX mode open locks.
 */
void ocfs2_open_unlock(struct inode *inode)
{
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry_void();

	mlog(0, "inode %llu drop open lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);

	if (ocfs2_mount_local(osb))
		goto out;

	if (lockres->l_ro_holders)
		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
				     DLM_LOCK_PR);
	if (lockres->l_ex_holders)
		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
				     DLM_LOCK_EX);

out:
	mlog_exit_void();
}
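
/*
 * The open lock protocol, roughly: every node that has the inode open
 * holds a PR open lock (ocfs2_open_lock()). A node that wants to know
 * whether it is the last user - e.g. before finally wiping an orphaned
 * inode - tries for a conflicting level with DLM_LKF_NOQUEUE through
 * ocfs2_try_open_lock(). A sketch of that check (illustrative only):
 *
 *	ret = ocfs2_try_open_lock(inode, 1);	// try for EX, don't queue
 *	if (ret == -EAGAIN) {
 *		// some other node still has this inode open -
 *		// defer the final delete
 *	}
 *
 * The -EAGAIN comes straight out of the DLM_LKF_NOQUEUE request made in
 * ocfs2_cluster_lock() above.
 */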

static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
				     int level)
{
	int ret;
	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
	unsigned long flags;
	struct ocfs2_mask_waiter mw;

	ocfs2_init_mask_waiter(&mw);

retry_cancel:
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
		ret = ocfs2_prepare_cancel_convert(osb, lockres);
		if (ret) {
			spin_unlock_irqrestore(&lockres->l_lock, flags);
			ret = ocfs2_cancel_convert(osb, lockres);
			if (ret < 0) {
				mlog_errno(ret);
				goto out;
			}
			goto retry_cancel;
		}
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		ocfs2_wait_for_mask(&mw);
		goto retry_cancel;
	}

	ret = -ERESTARTSYS;
	/*
	 * We may still have gotten the lock, in which case there's no
	 * point to restarting the syscall.
	 */
	if (lockres->l_level == level)
		ret = 0;

	mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
	     lockres->l_flags, lockres->l_level, lockres->l_action);

	spin_unlock_irqrestore(&lockres->l_lock, flags);

out:
	return ret;
}

/*
 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
 * flock() calls. The locking approach this requires is sufficiently
 * different from all other cluster lock types that we implement a
 * separate path to the "low-level" dlm calls. In particular:
 *
 * - No optimization of lock levels is done - we take exactly
 *   what's been requested.
 *
 * - No lock caching is employed. We immediately downconvert to
 *   no-lock at unlock time. This also means flock locks never go on
 *   the blocking list.
 *
 * - Since userspace can trivially deadlock itself with flock, we make
 *   sure to allow cancellation of a misbehaving application's flock()
 *   request.
 *
 * - Access to any flock lockres doesn't require concurrency, so we
 *   can simplify the code by requiring the caller to guarantee
 *   serialization of dlmglue flock calls.
 */
int ocfs2_file_lock(struct file *file, int ex, int trylock)
{
	int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
	unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0;
	unsigned long flags;
	struct ocfs2_file_private *fp = file->private_data;
	struct ocfs2_lock_res *lockres = &fp->fp_flock;
	struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
	struct ocfs2_mask_waiter mw;

	ocfs2_init_mask_waiter(&mw);

	if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
	    (lockres->l_level > DLM_LOCK_NL)) {
		mlog(ML_ERROR,
		     "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
		     "level: %u\n", lockres->l_name, lockres->l_flags,
		     lockres->l_level);
		return -EINVAL;
	}

	spin_lock_irqsave(&lockres->l_lock, flags);
	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		/*
		 * Get the lock at NLMODE to start - that way we
		 * can cancel the upconvert request if need be.
		 */
		ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_wait_for_mask(&mw);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		spin_lock_irqsave(&lockres->l_lock, flags);
	}

	lockres->l_action = OCFS2_AST_CONVERT;
	lkm_flags |= DLM_LKF_CONVERT;
	lockres->l_requested = level;
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);

	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
			     lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
			     lockres);
	if (ret) {
		if (!trylock || (ret != -EAGAIN)) {
			ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
			ret = -EINVAL;
		}

		ocfs2_recover_from_dlm_error(lockres, 1);
		lockres_remove_mask_waiter(lockres, &mw);
		goto out;
	}

	ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
	if (ret == -ERESTARTSYS) {
		/*
		 * Userspace can deadlock itself with flock(). Current
		 * behavior locally is to allow the deadlock, but abort the
		 * system call if a signal is received. We follow this
		 * example, otherwise a poorly written program could sit in
		 * the kernel until reboot.
		 *
		 * Handling this is a bit more complicated for Ocfs2
		 * though. We can't exit this function with an outstanding
		 * lock request, so a cancel convert is required. We
		 * intentionally overwrite 'ret' - if the cancel fails and
		 * the lock was granted, it's easier to just bubble success
		 * back up to the user.
		 */
		ret = ocfs2_flock_handle_signal(lockres, level);
	} else if (!ret && (level > lockres->l_level)) {
		/* Trylock failed asynchronously */
		BUG_ON(!trylock);
		ret = -EAGAIN;
	}

out:
	mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
	     lockres->l_name, ex, trylock, ret);
	return ret;
}
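
/*
 * The (ex, trylock) pair above is driven by the flock() handling code,
 * which translates the userspace request into one of the following
 * shapes (sketch only; the actual translation lives outside this file):
 *
 *	// exclusive, blocking request:
 *	ret = ocfs2_file_lock(file, 1, 0);
 *
 *	// shared, non-blocking request:
 *	ret = ocfs2_file_lock(file, 0, 1);
 *	if (ret == -EAGAIN)
 *		// the lock is held at a conflicting level elsewhere in
 *		// the cluster and, because of DLM_LKF_NOQUEUE, the
 *		// request was not queued
 */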

void ocfs2_file_unlock(struct file *file)
{
	int ret;
	unsigned int gen;
	unsigned long flags;
	struct ocfs2_file_private *fp = file->private_data;
	struct ocfs2_lock_res *lockres = &fp->fp_flock;
	struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
	struct ocfs2_mask_waiter mw;

	ocfs2_init_mask_waiter(&mw);

	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
		return;

	if (lockres->l_level == DLM_LOCK_NL)
		return;

	mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
	     lockres->l_name, lockres->l_flags, lockres->l_level,
	     lockres->l_action);

	spin_lock_irqsave(&lockres->l_lock, flags);
	/*
	 * Fake a blocking ast for the downconvert code.
	 */
	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
	lockres->l_blocking = DLM_LOCK_EX;

	gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL);
	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen);
	if (ret) {
		mlog_errno(ret);
		return;
	}

	ret = ocfs2_wait_for_mask(&mw);
	if (ret)
		mlog_errno(ret);
}

static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres)
{
	int kick = 0;

	mlog_entry_void();

	/* If we know that another node is waiting on our lock, kick
	 * the downconvert thread pre-emptively when we reach a release
	 * condition. */
	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
		switch (lockres->l_blocking) {
		case DLM_LOCK_EX:
			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
				kick = 1;
			break;
		case DLM_LOCK_PR:
			if (!lockres->l_ex_holders)
				kick = 1;
			break;
		default:
			BUG();
		}
	}

	if (kick)
		ocfs2_wake_downconvert_thread(osb);

	mlog_exit_void();
}

#define OCFS2_SEC_BITS   34
#define OCFS2_SEC_SHIFT  (64 - 34)
#define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)

/* LVB only has room for 64 bits of time here so we pack it for
 * now. */
static u64 ocfs2_pack_timespec(struct timespec *spec)
{
	u64 res;
	u64 sec = spec->tv_sec;
	u32 nsec = spec->tv_nsec;

	res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);

	return res;
}
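
/*
 * Packing example: the top OCFS2_SEC_BITS (34) bits of the u64 carry the
 * seconds and the low OCFS2_SEC_SHIFT (30) bits carry the nanoseconds,
 * which fits because tv_nsec < 10^9 < 2^30. For instance (values made up
 * purely for illustration):
 *
 *	struct timespec ts = { .tv_sec = 1234567890, .tv_nsec = 987654321 };
 *	u64 packed = ocfs2_pack_timespec(&ts);
 *	// packed == (1234567890ULL << 30) | 987654321
 *
 * ocfs2_unpack_timespec() below simply reverses the shift and mask, so a
 * pack followed by an unpack is lossless for any in-range timespec.
 */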

/* Call this with the lockres locked. I am reasonably sure we don't
 * need ip_lock in this function as anyone who would be changing those
 * values is supposed to be blocked in ocfs2_inode_lock right now. */
static void __ocfs2_stuff_meta_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
	struct ocfs2_meta_lvb *lvb;

	mlog_entry_void();

	lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);

	/*
	 * Invalidate the LVB of a deleted inode - this way other
	 * nodes are forced to go to disk and discover the new inode
	 * status.
	 */
	if (oi->ip_flags & OCFS2_INODE_DELETED) {
		lvb->lvb_version = 0;
		goto out;
	}

	lvb->lvb_version   = OCFS2_LVB_VERSION;
	lvb->lvb_isize	   = cpu_to_be64(i_size_read(inode));
	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
	lvb->lvb_iuid      = cpu_to_be32(inode->i_uid);
	lvb->lvb_igid      = cpu_to_be32(inode->i_gid);
	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);
	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);
	lvb->lvb_iatime_packed  =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
	lvb->lvb_ictime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
	lvb->lvb_imtime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
	lvb->lvb_iattr    = cpu_to_be32(oi->ip_attr);
	lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
	lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);

out:
	mlog_meta_lvb(0, lockres);

	mlog_exit_void();
}

static void ocfs2_unpack_timespec(struct timespec *spec,
				  u64 packed_time)
{
	spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
	spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
}

static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
	struct ocfs2_meta_lvb *lvb;

	mlog_entry_void();

	mlog_meta_lvb(0, lockres);

	lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);

	/* We're safe here without the lockres lock... */
	spin_lock(&oi->ip_lock);
	oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));

	oi->ip_attr = be32_to_cpu(lvb->lvb_iattr);
	oi->ip_dyn_features = be16_to_cpu(lvb->lvb_idynfeatures);
	ocfs2_set_inode_flags(inode);

	/* fast-symlinks are a special case */
	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
		inode->i_blocks = 0;
	else
		inode->i_blocks = ocfs2_inode_sector_count(inode);

	inode->i_uid     = be32_to_cpu(lvb->lvb_iuid);
	inode->i_gid     = be32_to_cpu(lvb->lvb_igid);
	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);
	inode->i_nlink   = be16_to_cpu(lvb->lvb_inlink);
	ocfs2_unpack_timespec(&inode->i_atime,
			      be64_to_cpu(lvb->lvb_iatime_packed));
	ocfs2_unpack_timespec(&inode->i_mtime,
			      be64_to_cpu(lvb->lvb_imtime_packed));
	ocfs2_unpack_timespec(&inode->i_ctime,
			      be64_to_cpu(lvb->lvb_ictime_packed));
	spin_unlock(&oi->ip_lock);

	mlog_exit_void();
}

static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
					      struct ocfs2_lock_res *lockres)
{
	struct ocfs2_meta_lvb *lvb =
		(struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);

	if (lvb->lvb_version == OCFS2_LVB_VERSION
	    && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
		return 1;
	return 0;
}

/* Determine whether a lock resource needs to be refreshed, and
 * arbitrate who gets to refresh it.
 *
 * 0 means no refresh needed.
 *
 * > 0 means you need to refresh this and you MUST call
 * ocfs2_complete_lock_res_refresh afterwards. */
static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
{
	unsigned long flags;
	int status = 0;

	mlog_entry_void();

refresh_check:
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto bail;
	}

	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		ocfs2_wait_on_refreshing_lock(lockres);
		goto refresh_check;
	}

	/* Ok, I'll be the one to refresh this lock. */
	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = 1;
bail:
	mlog_exit(status);
	return status;
}
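
/*
 * Expected calling pattern for the refresh arbitration above (a sketch;
 * the real user is the inode metadata locking code further down in this
 * file):
 *
 *	status = ocfs2_should_refresh_lock_res(lockres);
 *	if (status) {
 *		// we won the race to refresh: re-read the inode state
 *		// (from a trustworthy LVB or from disk), then report how
 *		// that went - a non-zero error keeps NEEDS_REFRESH set
 *		err = ...;	// result of the actual refresh work
 *		ocfs2_complete_lock_res_refresh(lockres, err);
 *	}
 *
 * Every caller that gets a non-zero return MUST eventually call
 * ocfs2_complete_lock_res_refresh(); otherwise other waiters stay stuck
 * in ocfs2_wait_on_refreshing_lock().
 */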

/* If status is non-zero, I'll mark it as not being in refresh
 * anymore, but I won't clear the needs refresh flag. */
static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,