path: root/drivers/media/video/v4l2-compat-ioctl32.c
blob: d0e1bd3ace6aec3a8f2aeab76a79509e783f762f
/*
 * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
 *	Separated from fs stuff by Arnd Bergmann <arnd@arndb.de>
 *
 * Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
 * Copyright (C) 1998  Eddie C. Dost  (ecd@skynet.be)
 * Copyright (C) 2001,2002  Andi Kleen, SuSE Labs
 * Copyright (C) 2003       Pavel Machek (pavel@suse.cz)
 * Copyright (C) 2005       Philippe De Muyter (phdm@macqel.be)
 * Copyright (C) 2008       Hans Verkuil <hverkuil@xs4all.nl>
 *
 * These routines maintain argument size conversion between 32bit and 64bit
 * ioctls.
 */

#include <linux/compat.h>
#define __OLD_VIDIOC_ /* To allow fixing old calls*/
#include <linux/videodev.h>
#include <linux/videodev2.h>
#include <linux/module.h>
#include <linux/smp_lock.h>
#include <media/v4l2-ioctl.h>

#ifdef CONFIG_COMPAT

#ifdef CONFIG_VIDEO_V4L1_COMPAT
struct video_tuner32 {
	compat_int_t tuner;
	char name[32];
	compat_ulong_t rangelow, rangehigh;
	u32 flags;	/* It is really u32 in videodev.h */
	u16 mode, signal;
};

static int get_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user *up)
{
	if (!access_ok(VERIFY_READ, up, sizeof(struct video_tuner32)) ||
		get_user(kp->tuner, &up->tuner) ||
		copy_from_user(kp->name, up->name, 32) ||
		get_user(kp->rangelow, &up->rangelow) ||
		get_user(kp->rangehigh, &up->rangehigh) ||
		get_user(kp->flags, &up->flags) ||
		get_user(kp->mode, &up->mode) ||
		get_user(kp->signal, &up->signal))
		return -EFAULT;
	return 0;
}

static int put_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user *up)
{
	if (!access_ok(VERIFY_WRITE, up, sizeof(struct video_tuner32)) ||
		put_user(kp->tuner, &up->tuner) ||
		copy_to_user(up->name, kp->name, 32) ||
		put_user(kp->rangelow, &up->rangelow) ||
		put_user(kp->rangehigh, &up->rangehigh) ||
		put_user(kp->flags, &up->flags) ||
		put_user(kp->mode, &up->mode) ||
		put_user(kp->signal, &up->signal))
			return -EFAULT;
	return 0;
}

struct video_buffer32 {
	compat_caddr_t base;
	compat_int_t height, width, depth, bytesperline;
};

static int get_video_buffer32(struct video_buffer *kp, struct video_buffer32 __user *up)
{
	u32 tmp;

	if (!access_ok(VERIFY_READ, up, sizeof(struct video_buffer32)) ||
		get_user(tmp, &up->base) ||
		get_user(kp->height, &up->height) ||
		get_user(kp->width, &up->width) ||
		get_user(kp->depth, &up->depth) ||
		get_user(kp->bytesperline, &up->bytesperline))
			return -EFAULT;

	/* This is actually a physical address stored
	 * as a void pointer.
	 */
	kp->base = (void *)(unsigned long) tmp;

	return 0;
}

static int put_video_buffer32(struct video_buffer *kp, struct video_buffer32 __user *up)
{
	u32 tmp = (u32)((unsigned long)kp->base);

	if (!access_ok(VERIFY_WRITE, up, sizeof(struct video_buffer32)) ||
		put_user(tmp, &up->base) ||
		put_user(kp->height, &up->height) ||
		put_user(kp->width, &up->width) ||
		put_user(kp->depth, &up->depth) ||
		put_user(kp->bytesperline, &up->bytesperline))
			return -EFAULT;
	return 0;
}

struct video_clip32 {
	s32 x, y, width, height;	/* It's really s32 in videodev.h */
	compat_caddr_t next;
};

struct video_window32 {
	u32 x, y, width, height, chromakey, flags;
	compat_caddr_t clips;
	compat_int_t clipcount;
};

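/* Convert a 32-bit V4L1 video_window from user space.  The clip list is
   either an array of video_clip structs or, when clipcount is negative, a
   fixed-size clipping bitmap; either way it is staged through
   compat_alloc_user_space() so the driver sees native pointers. */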
static int get_video_window32(struct video_window *kp, struct video_window32 __user *up)
{
	struct video_clip __user *uclips;
	struct video_clip __user *kclips;
	compat_caddr_t p;
	int nclips;

	if (!access_ok(VERIFY_READ, up, sizeof(struct video_window32)) ||
	    get_user(kp->x, &up->x) ||
	    get_user(kp->y, &up->y) ||
	    get_user(kp->width, &up->width) ||
	    get_user(kp->height, &up->height) ||
	    get_user(kp->chromakey, &up->chromakey) ||
	    get_user(kp->flags, &up->flags) ||
	    get_user(kp->clipcount, &up->clipcount))
		return -EFAULT;

	nclips = kp->clipcount;
	kp->clips = NULL;

	if (nclips == 0)
		return 0;
	if (get_user(p, &up->clips))
		return -EFAULT;
	uclips = compat_ptr(p);

	/* If nclips < 0, then it is a clipping bitmap of size
	   VIDEO_CLIPMAP_SIZE */
	if (nclips < 0) {
		if (!access_ok(VERIFY_READ, uclips, VIDEO_CLIPMAP_SIZE))
			return -EFAULT;
		kp->clips = compat_alloc_user_space(VIDEO_CLIPMAP_SIZE);
		if (copy_in_user(kp->clips, uclips, VIDEO_CLIPMAP_SIZE))
			return -EFAULT;
		return 0;
	}

	/* Otherwise it is an array of video_clip structs. */
	if (!access_ok(VERIFY_READ, uclips, nclips * sizeof(struct video_clip)))
		return -EFAULT;

	kp->clips = compat_alloc_user_space(nclips * sizeof(struct video_clip));
	kclips = kp->clips;
	while (nclips--) {
		int err;

		err = copy_in_user(&kclips->x, &uclips->x, sizeof(kclips->x));
		err |= copy_in_user(&kclips->y, &uclips->y, sizeof(kclips->y));
		err |= copy_in_user(&kclips->width, &uclips->width, sizeof(kclips->width));
		err |= copy_in_user(&kclips->height, &uclips->height, sizeof(kclips->height));
		kclips->next = NULL;
		if (err)
			return -EFAULT;
		kclips++;
		uclips++;
	}
	return 0;
}

/* You get back everything except the clips... */
static int put_video_window32(struct video_window *kp, struct video_window32 __user *up)
{
	if (!access_ok(VERIFY_WRITE, up, sizeof(struct video_window32)) ||
		put_user(kp->x, &up->x) ||
		put_user(kp->y, &up->y) ||
		put_user(kp->width, &up->width) ||
		put_user(kp->height, &up->height) ||
		put_user(kp->chromakey, &up->chromakey) ||
		put_user(kp->flags, &up->flags) ||
		put_user(kp->clipcount, &up->clipcount))
			return -EFAULT;
	return 0;
}

struct video_code32 {
	char		loadwhat[16];	/* name or tag of file being passed */
	compat_int_t	datasize;
	unsigned char	*data;
};

static int get_microcode32(struct video_code *kp, struct video_code32 __user *up)
{
	if (!access_ok(VERIFY_READ, up, sizeof(struct video_code32)) ||
		copy_from_user(kp->loadwhat, up->loadwhat, sizeof(up->loadwhat)) ||
		get_user(kp->datasize, &up->datasize) ||
		copy_from_user(kp->data, up->data, up->datasize))
			return -EFAULT;
	return 0;
}

#define VIDIOCGTUNER32		_IOWR('v', 4, struct video_tuner32)
#define VIDIOCSTUNER32		_IOW('v', 5, struct video_tuner32)
#define VIDIOCGWIN32		_IOR('v', 9, struct video_window32)
#define VIDIOCSWIN32		_IOW('v', 10, struct video_window32)
#define VIDIOCGFBUF32		_IOR('v', 11, struct video_buffer32)
#define VIDIOCSFBUF32		_IOW('v', 12, struct video_buffer32)
#define VIDIOCGFREQ32		_IOR('v', 14, u32)
#define VIDIOCSFREQ32		_IOW('v', 15, u32)
#define VIDIOCSMICROCODE32	_IOW('v', 27, struct video_code32)

#define VIDIOCCAPTURE32		_IOW('v', 8, s32)
#define VIDIOCSYNC32		_IOW('v', 18, s32)
#define VIDIOCSWRITEMODE32	_IOW('v', 25, s32)

#endif

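/* Invoke the driver's native ioctl handler: prefer unlocked_ioctl and fall
   back to the legacy ->ioctl under the big kernel lock. */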
static int native_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	int ret = -ENOIOCTLCMD;

	if (file->f_op->unlocked_ioctl)
		ret = file->f_op->unlocked_ioctl(file, cmd, arg);
	else if (file->f_op->ioctl) {
		lock_kernel();
		ret = file->f_op->ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
		unlock_kernel();
	}

	return ret;
}


struct v4l2_clip32 {
	struct v4l2_rect        c;
	compat_caddr_t 		next;
};

struct v4l2_window32 {
	struct v4l2_rect        w;
	enum v4l2_field  	field;
	__u32			chromakey;
	compat_caddr_t		clips; /* actually struct v4l2_clip32 * */
	__u32			clipcount;
	compat_caddr_t		bitmap;
};

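/* The clip list, if present, is rewritten as an array of native v4l2_clip
   structs in a compat_alloc_user_space() area, with the next pointers
   chained so that drivers can walk the list as usual. */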
static int get_v4l2_window32(struct v4l2_window *kp, struct v4l2_window32 __user *up)
{
	if (!access_ok(VERIFY_READ, up, sizeof(struct v4l2_window32)) ||
		copy_from_user(&kp->w, &up->w, sizeof(up->w)) ||
		get_user(kp->field, &up->field) ||
		get_user(kp->chromakey, &up->chromakey) ||
		get_user(kp->clipcount, &up->clipcount))
			return -EFAULT;
	if (kp->clipcount > 2048)
		return -EINVAL;
	if (kp->clipcount) {
		struct v4l2_clip32 __user *uclips;
		struct v4l2_clip __user *kclips;
		int n = kp->clipcount;
		compat_caddr_t p;

		if (get_user(p, &up->clips))
			return -EFAULT;
		uclips = compat_ptr(p);
		kclips = compat_alloc_user_space(n * sizeof(struct v4l2_clip));
		kp->clips = kclips;
		while (--n >= 0) {
			if (copy_in_user(&kclips->c, &uclips->c, sizeof(uclips->c)))
				return -EFAULT;
			if (put_user(n ? kclips + 1 : NULL, &kclips->next))
				return -EFAULT;
			uclips += 1;
			kclips += 1;
		}
	} else
		kp->clips = NULL;
	return 0;
}

static int put_v4l2_window32(struct v4l2_window *kp, struct v4l2_window32 __user *up)
{
	if (copy_to_user(&up->w, &kp->w, sizeof(up->w)) ||
		put_user(kp->field, &up->field) ||
		put_user(kp->chromakey, &up->chromakey) ||
		put_user(kp->clipcount, &up->clipcount))
			return -EFAULT;
	return 0;
}

static inline int get_v4l2_pix_format(struct v4l2_pix_format *kp, struct v4l2_pix_format __user *up)
{
	if (copy_from_user(kp, up, sizeof(struct v4l2_pix_format)))
		return -EFAULT;
	return 0;
}

static inline int put_v4l2_pix_format(struct v4l2_pix_format *kp, struct v4l2_pix_format __user *up)
{
	if (copy_to_user(up, kp, sizeof(struct v4l2_pix_format)))
		return -EFAULT;
	return 0;
}

static inline int get_v4l2_vbi_format(struct v4l2_vbi_format *kp, struct v4l2_vbi_format __user *up)
{
	if (copy_from_user(kp, up, sizeof(struct v4l2_vbi_format)))
		return -EFAULT;
	return 0;
}

static inline int put_v4l2_vbi_format(struct v4l2_vbi_format *kp, struct v4l2_vbi_format __user *up)
{
	if (copy_to_user(up, kp, sizeof(struct v4l2_vbi_format)))
		return -EFAULT;
	return 0;
}

static inline int get_v4l2_sliced_vbi_format(struct v4l2_sliced_vbi_format *kp, struct v4l2_sliced_vbi_format __user *up)
{
	if (copy_from_user(kp, up, sizeof(struct v4l2_sliced_vbi_format)))
		return -EFAULT;
	return 0;
}

static inline int put_v4l2_sliced_vbi_format(struct v4l2_sliced_vbi_format *kp, struct v4l2_sliced_vbi_format __user *up)
{
	if (copy_to_user(up, kp, sizeof(struct v4l2_sliced_vbi_format)))
		return -EFAULT;
	return 0;
}

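/* Only the overlay window member of the fmt union contains pointers, so it
   is the only one that needs a separate 32-bit layout. */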
struct v4l2_format32 {
	enum v4l2_buf_type type;
	union {
		struct v4l2_pix_format	pix;
		struct v4l2_window32	win;
		struct v4l2_vbi_format	vbi;
		struct v4l2_sliced_vbi_format	sliced;
		__u8	raw_data[200];        /* user-defined */
	} fmt;
};

static int get_v4l2_format32(struct v4l2_format *kp, struct v4l2_format32 __user *up)
{
	if (!access_ok(VERIFY_READ, up, sizeof(struct v4l2_format32)) ||
			get_user(kp->type, &up->type))
			return -EFAULT;
	switch (kp->type) {
	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
		return get_v4l2_pix_format(&kp->fmt.pix, &up->fmt.pix);
	case V4L2_BUF_TYPE_VIDEO_OVERLAY:
	case V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY:
		return get_v4l2_window32(&kp->fmt.win, &up->fmt.win);
	case V4L2_BUF_TYPE_VBI_CAPTURE:
	case V4L2_BUF_TYPE_VBI_OUTPUT:
		return get_v4l2_vbi_format(&kp->fmt.vbi, &up->fmt.vbi);
	case V4L2_BUF_TYPE_SLICED_VBI_CAPTURE:
	case V4L2_BUF_TYPE_SLICED_VBI_OUTPUT:
		return get_v4l2_sliced_vbi_format(&kp->fmt.sliced, &up->fmt.sliced);
	case V4L2_BUF_TYPE_PRIVATE:
		if (copy_from_user(kp, up, sizeof(kp->fmt.raw_data)))
			return -EFAULT;
		return 0;
	case 0:
		return -EINVAL;
	default:
		printk(KERN_INFO "compat_ioctl32: unexpected VIDIOC_FMT type %d\n",
								kp->type);
		return -EINVAL;
	}
}

static int put_v4l2_format32(struct v4l2_format *kp, struct v4l2_format32 __user *up)
{
	if (!access_ok(VERIFY_WRITE, up, sizeof(struct v4l2_format32)) ||
		put_user(kp->type, &up->type))
		return -EFAULT;
	switch (kp->type) {
	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
		return put_v4l2_pix_format(&kp->fmt.pix, &up->fmt.pix);
	case V4L2_BUF_TYPE_VIDEO_OVERLAY:
	case V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY:
		return put_v4l2_window32(&kp->fmt.win, &up->fmt.win);
	case V4L2_BUF_TYPE_VBI_CAPTURE:
	case V4L2_BUF_TYPE_VBI_OUTPUT:
		return put_v4l2_vbi_format(&kp->fmt.vbi, &up->fmt.vbi);
	case V4L2_BUF_TYPE_SLICED_VBI_CAPTURE:
	case V4L2_BUF_TYPE_SLICED_VBI_OUTPUT:
		return put_v4l2_sliced_vbi_format(&kp->fmt.sliced, &up->fmt.sliced);
	case V4L2_BUF_TYPE_PRIVATE:
		if (copy_to_user(up, kp, sizeof(up->fmt.raw_data)))
			return -EFAULT;
		return 0;
	case 0:
		return -EINVAL;
	default:
		printk(KERN_INFO "compat_ioctl32: unexpected VIDIOC_FMT type %d\n",
								kp->type);
		return -EINVAL;
	}
}

struct v4l2_standard32 {
	__u32		     index;
	__u32		     id[2]; /* __u64 would get the alignment wrong */
	__u8		     name[24];
	struct v4l2_fract    frameperiod; /* Frames, not fields */
	__u32		     framelines;
	__u32		     reserved[4];
};

static int get_v4l2_standard32(struct v4l2_standard *kp, struct v4l2_standard32 __user *up)
{
	/* other fields are not set by the user, nor used by the driver */
	if (!access_ok(VERIFY_READ, up, sizeof(struct v4l2_standard32)) ||
		get_user(kp->index, &up->index))
		return -EFAULT;
	return 0;
}

static int put_v4l2_standard32(struct v4l2_standard *kp, struct v4l2_standard32 __user *up)
{
	if (!access_ok(VERIFY_WRITE, up, sizeof(struct v4l2_standard32)) ||
		put_user(kp->index, &up->index) ||
		copy_to_user(up->id, &kp->id, sizeof(__u64)) ||
		copy_to_user(up->name, kp->name, 24) ||
		copy_to_user(&up->frameperiod, &kp->frameperiod, sizeof(kp->frameperiod)) ||
		put_user(kp->framelines, &up->framelines) ||
		copy_to_user(up->reserved, kp->reserved, 4 * sizeof(__u32)))
			return -EFAULT;
	return 0;
}

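/* The timestamp (struct timeval) and the userptr union member are long or
   pointer sized, so the buffer layout differs between 32-bit and 64-bit
   user space. */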
struct v4l2_buffer32 {
	__u32			index;
	enum v4l2_buf_type      type;
	__u32			bytesused;
	__u32			flags;
	enum v4l2_field		field;
	struct compat_timeval	timestamp;
	struct v4l2_timecode	timecode;
	__u32			sequence;

	/* memory location */
	enum v4l2_memory        memory;
	union {
		__u32           offset;
		compat_long_t   userptr;
	} m;
	__u32			length;
	__u32			input;
	__u32			reserved;
};

static int get_v4l2_buffer32(struct v4l2_buffer *kp, struct v4l2_buffer32 __user *up)
{

	if (!access_ok(VERIFY_READ, up, sizeof(struct v4l2_buffer32)) ||
		get_user(kp->index, &up->index) ||
		get_user(kp->type, &up->type) ||
		get_user(kp->flags, &up->flags) ||
		get_user(kp->memory, &up->memory) ||
		get_user(kp->input, &up->input))
			return -EFAULT;
	switch (kp->memory) {
	case V4L2_MEMORY_MMAP:
		break;
	case V4L2_MEMORY_USERPTR:
		{
		compat_long_t tmp;

		if (get_user(kp->length, &up->length) ||
		    get_user(tmp, &up->m.userptr))
			return -EFAULT;

		kp->m.userptr = (unsigned long)compat_ptr(tmp);
		}
		break;
	case V4L2_MEMORY_OVERLAY:
		if (get_user(kp->m.offset, &up->m.offset))
			return -EFAULT;
		break;
	}
	return 0;
}

static int put_v4l2_buffer32(struct v4l2_buffer *kp, struct v4l2_buffer32 __user *up)
{
	if (!access_ok(VERIFY_WRITE, up, sizeof(struct v4l2_buffer32)) ||
		put_user(kp->index, &up->index) ||
		put_user(kp->type, &up->type) ||
		put_user(kp->flags, &up->flags) ||
		put_user(kp->memory, &up->memory) ||
		put_user(kp->input, &up->input))
			return -EFAULT;
	switch (kp->memory) {
	case V4L2_MEMORY_MMAP:
		if (put_user(kp->length, &up->length) ||
			put_user(kp->m.offset, &up->m.offset))
			return -EFAULT;
		break;
	case V4L2_MEMORY_USERPTR:
		if (put_user(kp->length, &up->length) ||
			put_user(kp->m.userptr, &up->m.userptr))
			return -EFAULT;
		break;
	case V4L2_MEMORY_OVERLAY:
		if (put_user(kp->m.offset, &up->m.offset))
			return -EFAULT;
		break;
	}
	if (put_user(kp->bytesused, &up->bytesused) ||
		put_user(kp->field, &up->field) ||
		put_user(kp->timestamp.tv_sec, &up->timestamp.tv_sec) ||
		put_user(kp->timestamp.tv_usec, &up->timestamp.tv_usec) ||
		copy_to_user(&up->timecode, &kp->timecode, sizeof(struct v4l2_timecode)) ||
		put_user(kp->sequence, &up->sequence) ||
		put_user(kp->reserved, &up->reserved))
			return -EFAULT;
	return 0;
}

struct v4l2_framebuffer32 {
	__u32			capability;
	__u32			flags;
	compat_caddr_t 		base;
	struct v4l2_pix_format	fmt;
};

static int get_v4l2_framebuffer32(struct v4l2_framebuffer *kp, struct v4l2_framebuffer32 __user *up)
{
	u32 tmp;

	if (!access_ok(VERIFY_READ, up, sizeof(struct v4l2_framebuffer32)) ||
		get_user(tmp, &up->base) ||
		get_user(kp->capability, &up->capability) ||
		get_user(kp->flags, &up->flags))
			return -EFAULT;
	kp->base = compat_ptr(tmp);
	get_v4l2_pix_format(&kp->fmt, &up->fmt);
	return 0;
}

static int put_v4l2_framebuffer32(struct v4l2_framebuffer *kp, struct v4l2_framebuffer32 __user *up)
{
	u32 tmp = (u32)((unsigned long)kp->base);

	if (!access_ok(VERIFY_WRITE, up, sizeof(struct v4l2_framebuffer32)) ||
		put_user(tmp, &up->base) ||
		put_user(kp->capability, &up->capability) ||
		put_user(kp->flags, &up->flags))
			return -EFAULT;
	put_v4l2_pix_format(&kp->fmt, &up->fmt);
	return 0;
}

struct v4l2_input32 {
	__u32	     index;		/*  Which input */
	__u8	     name[32];		/*  Label */
	__u32	     type;		/*  Type of input */
	__u32	     audioset;		/*  Associated audios (bitfield) */
	__u32        tuner;             /*  Associated tuner */
	v4l2_std_id  std;
	__u32	     status;
	__u32	     reserved[4];
} __attribute__ ((packed));

/* The 64-bit v4l2_input struct has extra padding at the end of the struct.
   Otherwise it is identical to the 32-bit version. */
static inline int get_v4l2_input32(struct v4l2_input *kp, struct v4l2_input32 __user *up)
{
	if (copy_from_user(kp, up, sizeof(struct v4l2_input32)))
		return -EFAULT;
	return 0;
}

static inline int put_v4l2_input32(struct v4l2_input *kp, struct v4l2_input32 __user *up)
{
	if (copy_to_user(up, kp, sizeof(struct v4l2_input32)))
		return -EFAULT;
	return 0;
}

struct v4l2_ext_controls32 {
       __u32 ctrl_class;
       __u32 count;
       __u32 error_idx;
       __u32 reserved[2];
       compat_caddr_t controls; /* actually struct v4l2_ext_control32 * */
};

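/* The control array referenced by ->controls is copied element by element
   into a compat_alloc_user_space() area so that the native handler sees a
   flat array of native v4l2_ext_control structs. */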
static int get_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext_controls32 __user *up)
{
	struct v4l2_ext_control __user *ucontrols;
	struct v4l2_ext_control __user *kcontrols;
	int n;
	compat_caddr_t p;

	if (!access_ok(VERIFY_READ, up, sizeof(struct v4l2_ext_controls32)) ||
		get_user(kp->ctrl_class, &up->ctrl_class) ||
		get_user(kp->count, &up->count) ||
		get_user(kp->error_idx, &up->error_idx) ||
		copy_from_user(kp->reserved, up->reserved, sizeof(kp->reserved)))
			return -EFAULT;
	n = kp->count;
	if (n == 0) {
		kp->controls = NULL;
		return 0;
	}
	if (get_user(p, &up->controls))
		return -EFAULT;
	ucontrols = compat_ptr(p);
	if (!access_ok(VERIFY_READ, ucontrols, n * sizeof(struct v4l2_ext_control)))
		return -EFAULT;
	kcontrols = compat_alloc_user_space(n * sizeof(struct v4l2_ext_control));
	kp->controls = kcontrols;
	while (--n >= 0) {
		if (copy_in_user(&kcontrols->id, &ucontrols->id, sizeof(__u32)))
			return -EFAULT;
		if (copy_in_user(&kcontrols->reserved2, &ucontrols->reserved2, sizeof(ucontrols->reserved2)))
			return -EFAULT;
		/* Note: if the void * part of the union ever becomes relevant
		   then we need to know the type of the control in order to do
		   the right thing here. Luckily, that is not yet an issue. */
		if (copy_in_user(&kcontrols->value, &ucontrols->value, sizeof(ucontrols->value)))
			return -EFAULT;
		ucontrols++;
		kcontrols++;
	}
	return 0;
}

static int put_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext_controls32 __user *up)
{
	struct v4l2_ext_control __user *ucontrols;
	struct v4l2_ext_control __user *kcontrols = kp->controls;
	int n = kp->count;
	compat_caddr_t p;

	if (!access_ok(VERIFY_WRITE, up, sizeof(struct v4l2_ext_controls32)) ||
		put_user(kp->ctrl_class, &up->ctrl_class) ||
		put_user(kp->count, &up->count) ||
		put_user(kp->error_idx, &up->error_idx) ||
		copy_to_user(up->reserved, kp->reserved, sizeof(up->reserved)))
			return -EFAULT;
	if (!kp->count)
		return 0;

	if (get_user(p, &up->controls))
		return -EFAULT;
	ucontrols = compat_ptr(p);
	if (!access_ok(VERIFY_WRITE, ucontrols, n * sizeof(struct v4l2_ext_control)))
		return -EFAULT;

	while (--n >= 0) {
		if (copy_in_user(&ucontrols->id, &kcontrols->id, sizeof(__u32)))
			return -EFAULT;
		if (copy_in_user(&ucontrols->reserved2, &kcontrols->reserved2,
					sizeof(ucontrols->reserved2)))
			return -EFAULT;
		/* Note: if the void * part of the union ever becomes relevant
		   then we need to know the type of the control in order to do
		   the right thing here. Luckily, that is not yet an issue. */
		if (copy_in_user(&ucontrols->value, &kcontrols->value, sizeof(ucontrols->value)))
			return -EFAULT;
		ucontrols++;
		kcontrols++;
	}
	return 0;
}

#define VIDIOC_G_FMT32		_IOWR('V',  4, struct v4l2_format32)
#define VIDIOC_S_FMT32		_IOWR('V',  5, struct v4l2_format32)
#define VIDIOC_QUERYBUF32	_IOWR('V',  9, struct v4l2_buffer32)
#define VIDIOC_G_FBUF32		_IOR ('V', 10, struct v4l2_framebuffer32)
#define VIDIOC_S_FBUF32		_IOW ('V', 11, struct v4l2_framebuffer32)
#define VIDIOC_QBUF32		_IOWR('V', 15, struct v4l2_buffer32)
#define VIDIOC_DQBUF32		_IOWR('V', 17, struct v4l2_buffer32)
#define VIDIOC_ENUMSTD32	_IOWR('V', 25, struct v4l2_standard32)
#define VIDIOC_ENUMINPUT32	_IOWR('V', 26, struct v4l2_input32)
#define VIDIOC_TRY_FMT32      	_IOWR('V', 64, struct v4l2_format32)
#define VIDIOC_G_EXT_CTRLS32    _IOWR('V', 71, struct v4l2_ext_controls32)
#define VIDIOC_S_EXT_CTRLS32    _IOWR('V', 72, struct v4l2_ext_controls32)
#define VIDIOC_TRY_EXT_CTRLS32  _IOWR('V', 73, struct v4l2_ext_controls32)

#define VIDIOC_OVERLAY32	_IOW ('V', 14, s32)
#ifdef __OLD_VIDIOC_
#define VIDIOC_OVERLAY32_OLD	_IOWR('V', 14, s32)
#endif
#define VIDIOC_STREAMON32	_IOW ('V', 18, s32)
#define VIDIOC_STREAMOFF32	_IOW ('V', 19, s32)
#define VIDIOC_G_INPUT32	_IOR ('V', 38, s32)
#define VIDIOC_S_INPUT32	_IOWR('V', 39, s32)
#define VIDIOC_G_OUTPUT32	_IOR ('V', 46, s32)
#define VIDIOC_S_OUTPUT32	_IOWR('V', 47, s32)

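/* Central conversion routine: translate the 32-bit ioctl number into its
   native counterpart, convert the argument into a kernel-side copy where
   the layouts differ, call the native handler (under KERNEL_DS when a
   kernel copy is used), and finally write the results back to the 32-bit
   caller. */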
static int do_video_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	union {
#ifdef CONFIG_VIDEO_V4L1_COMPAT
		struct video_tuner vt;
		struct video_buffer vb;
		struct video_window vw;
		struct video_code vc;
		struct video_audio va;
#endif
		struct v4l2_format v2f;
		struct v4l2_buffer v2b;
		struct v4l2_framebuffer v2fb;
		struct v4l2_input v2i;
		struct v4l2_standard v2s;
		struct v4l2_ext_controls v2ecs;
		unsigned long vx;
		int vi;
	} karg;
	void __user *up = compat_ptr(arg);
	int compatible_arg = 1;
	int err = 0;

	/* First, convert the command. */
	switch (cmd) {
#ifdef CONFIG_VIDEO_V4L1_COMPAT
	case VIDIOCGTUNER32: cmd = VIDIOCGTUNER; break;
	case VIDIOCSTUNER32: cmd = VIDIOCSTUNER; break;
	case VIDIOCGWIN32: cmd = VIDIOCGWIN; break;
	case VIDIOCSWIN32: cmd = VIDIOCSWIN; break;
	case VIDIOCGFBUF32: cmd = VIDIOCGFBUF; break;
	case VIDIOCSFBUF32: cmd = VIDIOCSFBUF; break;
	case VIDIOCGFREQ32: cmd = VIDIOCGFREQ; break;
	case VIDIOCSFREQ32: cmd = VIDIOCSFREQ; break;
	case VIDIOCSMICROCODE32: cmd = VIDIOCSMICROCODE; break;
#endif
	case VIDIOC_G_FMT32: cmd = VIDIOC_G_FMT; break;
	case VIDIOC_S_FMT32: cmd = VIDIOC_S_FMT; break;
	case VIDIOC_QUERYBUF32: cmd = VIDIOC_QUERYBUF; break;
	case VIDIOC_G_FBUF32: cmd = VIDIOC_G_FBUF; break;
	case VIDIOC_S_FBUF32: cmd = VIDIOC_S_FBUF; break;
	case VIDIOC_QBUF32: cmd = VIDIOC_QBUF; break;
	case VIDIOC_DQBUF32: cmd = VIDIOC_DQBUF; break;
	case VIDIOC_ENUMSTD32: cmd = VIDIOC_ENUMSTD; break;
	case VIDIOC_ENUMINPUT32: cmd = VIDIOC_ENUMINPUT; break;
	case VIDIOC_TRY_FMT32: cmd = VIDIOC_TRY_FMT; break;
	case VIDIOC_G_EXT_CTRLS32: cmd = VIDIOC_G_EXT_CTRLS; break;
	case VIDIOC_S_EXT_CTRLS32: cmd = VIDIOC_S_EXT_CTRLS; break;
	case VIDIOC_TRY_EXT_CTRLS32: cmd = VIDIOC_TRY_EXT_CTRLS; break;
	case VIDIOC_OVERLAY32: cmd = VIDIOC_OVERLAY; break;
#ifdef __OLD_VIDIOC_
	case VIDIOC_OVERLAY32_OLD: cmd = VIDIOC_OVERLAY; break;
#endif
	case VIDIOC_STREAMON32: cmd = VIDIOC_STREAMON; break;
	case VIDIOC_STREAMOFF32: cmd = VIDIOC_STREAMOFF; break;
	case VIDIOC_G_INPUT32: cmd = VIDIOC_G_INPUT; break;
	case VIDIOC_S_INPUT32: cmd = VIDIOC_S_INPUT; break;
	case VIDIOC_G_OUTPUT32: cmd = VIDIOC_G_OUTPUT; break;
	case VIDIOC_S_OUTPUT32: cmd = VIDIOC_S_OUTPUT; break;
	}

	switch (cmd) {
#ifdef CONFIG_VIDEO_V4L1_COMPAT
	case VIDIOCSTUNER:
	case VIDIOCGTUNER:
		err = get_video_tuner32(&karg.vt, up);
		compatible_arg = 0;
		break;

	case VIDIOCSFBUF:
		err = get_video_buffer32(&karg.vb, up);
		compatible_arg = 0;
		break;

	case VIDIOCSWIN:
		err = get_video_window32(&karg.vw, up);
		compatible_arg = 0;
		break;

	case VIDIOCGWIN:
	case VIDIOCGFBUF:
	case VIDIOCGFREQ:
		compatible_arg = 0;
		break;

	case VIDIOCSMICROCODE:
		err = get_microcode32(&karg.vc, up);
		compatible_arg = 0;
		break;

	case VIDIOCSFREQ:
		err = get_user(karg.vx, (u32 __user *)up);
		compatible_arg = 0;
		break;

	case VIDIOCCAPTURE:
	case VIDIOCSYNC:
	case VIDIOCSWRITEMODE:
#endif
	case VIDIOC_OVERLAY:
	case VIDIOC_STREAMON:
	case VIDIOC_STREAMOFF:
	case VIDIOC_S_INPUT:
	case VIDIOC_S_OUTPUT:
		err = get_user(karg.vi, (s32 __user *)up);
		compatible_arg = 0;
		break;

	case VIDIOC_G_INPUT:
	case VIDIOC_G_OUTPUT:
		compatible_arg = 0;
		break;

	case VIDIOC_G_FMT:
	case VIDIOC_S_FMT:
	case VIDIOC_TRY_FMT:
		err = get_v4l2_format32(&karg.v2f, up);
		compatible_arg = 0;
		break;

	case VIDIOC_QUERYBUF:
	case VIDIOC_QBUF:
	case VIDIOC_DQBUF:
		err = get_v4l2_buffer32(&karg.v2b, up);
		compatible_arg = 0;
		break;

	case VIDIOC_S_FBUF:
		err = get_v4l2_framebuffer32(&karg.v2fb, up);
		compatible_arg = 0;
		break;

	case VIDIOC_G_FBUF:
		compatible_arg = 0;
		break;

	case VIDIOC_ENUMSTD:
		err = get_v4l2_standard32(&karg.v2s, up);
		compatible_arg = 0;
		break;

	case VIDIOC_ENUMINPUT:
		err = get_v4l2_input32(&karg.v2i, up);
		compatible_arg = 0;
		break;

	case VIDIOC_G_EXT_CTRLS:
	case VIDIOC_S_EXT_CTRLS:
	case VIDIOC_TRY_EXT_CTRLS:
		err = get_v4l2_ext_controls32(&karg.v2ecs, up);
		compatible_arg = 0;
		break;
	}
	if (err)
		return err;

	if (compatible_arg)
		err = native_ioctl(file, cmd, (unsigned long)up);
	else {
		mm_segment_t old_fs = get_fs();

		set_fs(KERNEL_DS);
		err = native_ioctl(file, cmd, (unsigned long)&karg);
		set_fs(old_fs);
	}

	/* Special case: even after an error we need to put the
	   results back for these ioctls since the error_idx will
	   contain information on which control failed. */
	switch (cmd) {
	case VIDIOC_G_EXT_CTRLS:
	case VIDIOC_S_EXT_CTRLS:
	case VIDIOC_TRY_EXT_CTRLS:
		if (put_v4l2_ext_controls32(&karg.v2ecs, up))
			err = -EFAULT;
		break;
	}
	if (err)
		return err;

	switch (cmd) {
#ifdef CONFIG_VIDEO_V4L1_COMPAT
	case VIDIOCGTUNER:
		err = put_video_tuner32(&karg.vt, up);
		break;

	case VIDIOCGWIN:
		err = put_video_window32(&karg.vw, up);
		break;

	case VIDIOCGFBUF:
		err = put_video_buffer32(&karg.vb, up);
		break;

	case VIDIOCGFREQ:
		err = put_user(((u32)karg.vx), (u32 __user *)up);
		break;
#endif
	case VIDIOC_S_INPUT:
	case VIDIOC_S_OUTPUT:
	case VIDIOC_G_INPUT:
	case VIDIOC_G_OUTPUT:
		err = put_user(((s32)karg.vi), (s32 __user *)up);
		break;

	case VIDIOC_G_FBUF:
		err = put_v4l2_framebuffer32(&karg.v2fb, up);
		break;

	case VIDIOC_G_FMT:
	case VIDIOC_S_FMT:
	case VIDIOC_TRY_FMT:
		err = put_v4l2_format32(&karg.v2f, up);
		break;

	case VIDIOC_QUERYBUF:
	case VIDIOC_QBUF:
	case VIDIOC_DQBUF:
		err = put_v4l2_buffer32(&karg.v2b, up);
		break;

	case VIDIOC_ENUMSTD:
		err = put_v4l2_standard32(&karg.v2s, up);
		break;

	case VIDIOC_ENUMINPUT:
		err = put_v4l2_input32(&karg.v2i, up);
		break;
	}
	return err;
}

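/* Entry point used by V4L2 drivers as their compat_ioctl handler.  Known
   commands are translated and forwarded; unrecognized ones are logged and
   rejected with -ENOIOCTLCMD. */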
long v4l_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg)
{
	int ret = -ENOIOCTLCMD;

	if (!file->f_op->ioctl && !file->f_op->unlocked_ioctl)
		return ret;

	switch (cmd) {
#ifdef CONFIG_VIDEO_V4L1_COMPAT
	case VIDIOCGCAP:
	case VIDIOCGCHAN:
	case VIDIOCSCHAN:
	case VIDIOCGTUNER32:
	case VIDIOCSTUNER32:
	case VIDIOCGPICT:
	case VIDIOCSPICT:
	case VIDIOCCAPTURE32:
	case VIDIOCGWIN32:
	case VIDIOCSWIN32:
	case VIDIOCGFBUF32:
	case VIDIOCSFBUF32:
	case VIDIOCKEY:
	case VIDIOCGFREQ32:
	case VIDIOCSFREQ32:
	case VIDIOCGAUDIO:
	case VIDIOCSAUDIO:
	case VIDIOCSYNC32:
	case VIDIOCMCAPTURE:
	case VIDIOCGMBUF:
	case VIDIOCGUNIT:
	case VIDIOCGCAPTURE:
	case VIDIOCSCAPTURE:
	case VIDIOCSPLAYMODE:
	case VIDIOCSWRITEMODE32:
	case VIDIOCGPLAYINFO:
	case VIDIOCSMICROCODE32:
	case VIDIOCGVBIFMT:
	case VIDIOCSVBIFMT:
#endif
#ifdef __OLD_VIDIOC_
	case VIDIOC_OVERLAY32_OLD:
	case VIDIOC_S_PARM_OLD:
	case VIDIOC_S_CTRL_OLD:
	case VIDIOC_G_AUDIO_OLD:
	case VIDIOC_G_AUDOUT_OLD:
	case VIDIOC_CROPCAP_OLD:
#endif
	case VIDIOC_QUERYCAP:
	case VIDIOC_RESERVED:
	case VIDIOC_ENUM_FMT:
	case VIDIOC_G_FMT32:
	case VIDIOC_S_FMT32:
	case VIDIOC_REQBUFS:
	case VIDIOC_QUERYBUF32:
	case VIDIOC_G_FBUF32:
	case VIDIOC_S_FBUF32:
	case VIDIOC_OVERLAY32:
	case VIDIOC_QBUF32:
	case VIDIOC_DQBUF32:
	case VIDIOC_STREAMON32:
	case VIDIOC_STREAMOFF32:
	case VIDIOC_G_PARM:
	case VIDIOC_S_PARM:
	case VIDIOC_G_STD:
	case VIDIOC_S_STD:
	case VIDIOC_ENUMSTD32:
	case VIDIOC_ENUMINPUT32:
	case VIDIOC_G_CTRL:
	case VIDIOC_S_CTRL:
	case VIDIOC_G_TUNER:
	case VIDIOC_S_TUNER:
	case VIDIOC_G_AUDIO:
	case VIDIOC_S_AUDIO:
	case VIDIOC_QUERYCTRL:
	case VIDIOC_QUERYMENU:
	case VIDIOC_G_INPUT32:
	case VIDIOC_S_INPUT32:
	case VIDIOC_G_OUTPUT32:
	case VIDIOC_S_OUTPUT32:
	case VIDIOC_ENUMOUTPUT:
	case VIDIOC_G_AUDOUT:
	case VIDIOC_S_AUDOUT:
	case VIDIOC_G_MODULATOR:
	case VIDIOC_S_MODULATOR:
	case VIDIOC_S_FREQUENCY:
	case VIDIOC_G_FREQUENCY:
	case VIDIOC_CROPCAP:
	case VIDIOC_G_CROP:
	case VIDIOC_S_CROP:
	case VIDIOC_G_JPEGCOMP:
	case VIDIOC_S_JPEGCOMP:
	case VIDIOC_QUERYSTD:
	case VIDIOC_TRY_FMT32:
	case VIDIOC_ENUMAUDIO:
	case VIDIOC_ENUMAUDOUT:
	case VIDIOC_G_PRIORITY:
	case VIDIOC_S_PRIORITY:
	case VIDIOC_G_SLICED_VBI_CAP:
	case VIDIOC_LOG_STATUS:
	case VIDIOC_G_EXT_CTRLS32:
	case VIDIOC_S_EXT_CTRLS32:
	case VIDIOC_TRY_EXT_CTRLS32:
	case VIDIOC_ENUM_FRAMESIZES:
	case VIDIOC_ENUM_FRAMEINTERVALS:
	case VIDIOC_G_ENC_INDEX:
	case VIDIOC_ENCODER_CMD:
	case VIDIOC_TRY_ENCODER_CMD:
	case VIDIOC_DBG_S_REGISTER:
	case VIDIOC_DBG_G_REGISTER:
	case VIDIOC_G_CHIP_IDENT:
	case VIDIOC_S_HW_FREQ_SEEK:
		ret = do_video_ioctl(file, cmd, arg);
		break;

#ifdef CONFIG_VIDEO_V4L1_COMPAT
	/* BTTV specific... */
	case _IOW('v',  BASE_VIDIOCPRIVATE+0, char [256]):
	case _IOR('v',  BASE_VIDIOCPRIVATE+1, char [256]):
	case _IOR('v' , BASE_VIDIOCPRIVATE+2, unsigned int):
	case _IOW('v' , BASE_VIDIOCPRIVATE+3, char [16]): /* struct bttv_pll_info */
	case _IOR('v' , BASE_VIDIOCPRIVATE+4, int):
	case _IOR('v' , BASE_VIDIOCPRIVATE+5, int):
	case _IOR('v' , BASE_VIDIOCPRIVATE+6, int):
	case _IOR('v' , BASE_VIDIOCPRIVATE+7, int):
		ret = native_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
		break;
#endif
	default:
		v4l_print_ioctl("compat_ioctl32", cmd);
		printk(KERN_CONT "\n");
		break;
	}
	return ret;
}
#else
long v4l_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg)
{
	return -ENOIOCTLCMD;
}
#endif
EXPORT_SYMBOL_GPL(v4l_compat_ioctl32);

MODULE_LICENSE("GPL");
pan class="hl opt">(node, struct dentry, d_u.d_child); list_del_init(node); if (d->d_inode) { /* This should never be called on a cgroup * directory with child cgroups */ BUG_ON(d->d_inode->i_mode & S_IFDIR); d = dget_locked(d); spin_unlock(&dcache_lock); d_delete(d); simple_unlink(dentry->d_inode, d); dput(d); spin_lock(&dcache_lock); } node = dentry->d_subdirs.next; } spin_unlock(&dcache_lock); } /* * NOTE : the dentry must have been dget()'ed */ static void cgroup_d_remove_dir(struct dentry *dentry) { cgroup_clear_directory(dentry); spin_lock(&dcache_lock); list_del_init(&dentry->d_u.d_child); spin_unlock(&dcache_lock); remove_dir(dentry); } static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { unsigned long added_bits, removed_bits; struct cgroup *cgrp = &root->top_cgroup; int i; removed_bits = root->actual_subsys_bits & ~final_bits; added_bits = final_bits & ~root->actual_subsys_bits; /* Check that any added subsystems are currently free */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; struct cgroup_subsys *ss = subsys[i]; if (!(bit & added_bits)) continue; if (ss->root != &rootnode) { /* Subsystem isn't free */ return -EBUSY; } } /* Currently we don't handle adding/removing subsystems when * any child cgroups exist. This is theoretically supportable * but involves complex error handling, so it's being left until * later */ if (root->number_of_cgroups > 1) return -EBUSY; /* Process each subsystem */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; unsigned long bit = 1UL << i; if (bit & added_bits) { /* We're binding this subsystem to this hierarchy */ BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_add(&ss->sibling, &root->subsys_list); rcu_assign_pointer(ss->root, root); if (ss->bind) ss->bind(ss, cgrp); } else if (bit & removed_bits) { /* We're removing this subsystem */ BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); if (ss->bind) ss->bind(ss, dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; rcu_assign_pointer(subsys[i]->root, &rootnode); list_del(&ss->sibling); } else if (bit & final_bits) { /* Subsystem state should already exist */ BUG_ON(!cgrp->subsys[i]); } else { /* Subsystem state shouldn't exist */ BUG_ON(cgrp->subsys[i]); } } root->subsys_bits = root->actual_subsys_bits = final_bits; synchronize_rcu(); return 0; } static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) { struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; struct cgroup_subsys *ss; mutex_lock(&cgroup_mutex); for_each_subsys(root, ss) seq_printf(seq, ",%s", ss->name); if (test_bit(ROOT_NOPREFIX, &root->flags)) seq_puts(seq, ",noprefix"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); mutex_unlock(&cgroup_mutex); return 0; } struct cgroup_sb_opts { unsigned long subsys_bits; unsigned long flags; char *release_agent; }; /* Convert a hierarchy specifier into a bitmask of subsystems and * flags. 
*/ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data ?: "all"; opts->subsys_bits = 0; opts->flags = 0; opts->release_agent = NULL; while ((token = strsep(&o, ",")) != NULL) { if (!*token) return -EINVAL; if (!strcmp(token, "all")) { /* Add all non-disabled subsystems */ int i; opts->subsys_bits = 0; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (!ss->disabled) opts->subsys_bits |= 1ul << i; } } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); } else if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) return -EINVAL; opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; strncpy(opts->release_agent, token + 14, PATH_MAX - 1); opts->release_agent[PATH_MAX - 1] = 0; } else { struct cgroup_subsys *ss; int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ss = subsys[i]; if (!strcmp(token, ss->name)) { if (!ss->disabled) set_bit(i, &opts->subsys_bits); break; } } if (i == CGROUP_SUBSYS_COUNT) return -ENOENT; } } /* We can't have an empty hierarchy */ if (!opts->subsys_bits) return -EINVAL; return 0; } static int cgroup_remount(struct super_block *sb, int *flags, char *data) { int ret = 0; struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; struct cgroup_sb_opts opts; mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); if (ret) goto out_unlock; /* Don't allow flags to change at remount */ if (opts.flags != root->flags) { ret = -EINVAL; goto out_unlock; } ret = rebind_subsystems(root, opts.subsys_bits); /* (re)populate subsystem files */ if (!ret) cgroup_populate_dir(cgrp); if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); out_unlock: if (opts.release_agent) kfree(opts.release_agent); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return ret; } static struct super_operations cgroup_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = cgroup_show_options, .remount_fs = cgroup_remount, }; static void init_cgroup_housekeeping(struct cgroup *cgrp) { INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); init_rwsem(&cgrp->pids_mutex); } static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; INIT_LIST_HEAD(&root->subsys_list); INIT_LIST_HEAD(&root->root_list); root->number_of_cgroups = 1; cgrp->root = root; cgrp->top_cgroup = cgrp; init_cgroup_housekeeping(cgrp); } static int cgroup_test_super(struct super_block *sb, void *data) { struct cgroupfs_root *new = data; struct cgroupfs_root *root = sb->s_fs_info; /* First check subsystems */ if (new->subsys_bits != root->subsys_bits) return 0; /* Next check flags */ if (new->flags != root->flags) return 0; return 1; } static int cgroup_set_super(struct super_block *sb, void *data) { int ret; struct cgroupfs_root *root = data; ret = set_anon_super(sb, NULL); if (ret) return ret; sb->s_fs_info = root; root->sb = sb; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = CGROUP_SUPER_MAGIC; sb->s_op = &cgroup_ops; return 0; } static int cgroup_get_rootdir(struct super_block *sb) { struct inode *inode = cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); struct 
dentry *dentry; if (!inode) return -ENOMEM; inode->i_fop = &simple_dir_operations; inode->i_op = &cgroup_dir_inode_operations; /* directories start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); dentry = d_alloc_root(inode); if (!dentry) { iput(inode); return -ENOMEM; } sb->s_root = dentry; return 0; } static int cgroup_get_sb(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data, struct vfsmount *mnt) { struct cgroup_sb_opts opts; int ret = 0; struct super_block *sb; struct cgroupfs_root *root; struct list_head tmp_cg_links; /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); if (ret) { if (opts.release_agent) kfree(opts.release_agent); return ret; } root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) { if (opts.release_agent) kfree(opts.release_agent); return -ENOMEM; } init_cgroup_root(root); root->subsys_bits = opts.subsys_bits; root->flags = opts.flags; if (opts.release_agent) { strcpy(root->release_agent_path, opts.release_agent); kfree(opts.release_agent); } sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); if (IS_ERR(sb)) { kfree(root); return PTR_ERR(sb); } if (sb->s_fs_info != root) { /* Reusing an existing superblock */ BUG_ON(sb->s_root == NULL); kfree(root); root = NULL; } else { /* New superblock */ struct cgroup *cgrp = &root->top_cgroup; struct inode *inode; int i; BUG_ON(sb->s_root != NULL); ret = cgroup_get_rootdir(sb); if (ret) goto drop_new_super; inode = sb->s_root->d_inode; mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); /* * We're accessing css_set_count without locking * css_set_lock here, but that's OK - it can only be * increased by someone holding cgroup_lock, and * that's us. The worst that can happen is that we * have some link structures left over */ ret = allocate_cg_links(css_set_count, &tmp_cg_links); if (ret) { mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); goto drop_new_super; } ret = rebind_subsystems(root, root->subsys_bits); if (ret == -EBUSY) { mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); goto free_cg_links; } /* EBUSY should be the only error here */ BUG_ON(ret); list_add(&root->root_list, &roots); root_count++; sb->s_root->d_fsdata = &root->top_cgroup; root->top_cgroup.dentry = sb->s_root; /* Link the top cgroup in this hierarchy into all * the css_set objects */ write_lock(&css_set_lock); for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { struct hlist_head *hhead = &css_set_table[i]; struct hlist_node *node; struct css_set *cg; hlist_for_each_entry(cg, node, hhead, hlist) { struct cg_cgroup_link *link; BUG_ON(list_empty(&tmp_cg_links)); link = list_entry(tmp_cg_links.next, struct cg_cgroup_link, cgrp_link_list); list_del(&link->cgrp_link_list); link->cg = cg; list_add(&link->cgrp_link_list, &root->top_cgroup.css_sets); list_add(&link->cg_link_list, &cg->cg_links); } } write_unlock(&css_set_lock); free_cg_links(&tmp_cg_links); BUG_ON(!list_empty(&cgrp->sibling)); BUG_ON(!list_empty(&cgrp->children)); BUG_ON(root->number_of_cgroups != 1); cgroup_populate_dir(cgrp); mutex_unlock(&inode->i_mutex); mutex_unlock(&cgroup_mutex); } return simple_set_mnt(mnt, sb); free_cg_links: free_cg_links(&tmp_cg_links); drop_new_super: up_write(&sb->s_umount); deactivate_super(sb); return ret; } static void cgroup_kill_sb(struct super_block *sb) { struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; int ret; struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; BUG_ON(!root); BUG_ON(root->number_of_cgroups 
!= 1); BUG_ON(!list_empty(&cgrp->children)); BUG_ON(!list_empty(&cgrp->sibling)); mutex_lock(&cgroup_mutex); /* Rebind all subsystems back to the default hierarchy */ ret = rebind_subsystems(root, 0); /* Shouldn't be able to fail ... */ BUG_ON(ret); /* * Release all the links from css_sets to this hierarchy's * root cgroup */ write_lock(&css_set_lock); list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, cgrp_link_list) { list_del(&link->cg_link_list); list_del(&link->cgrp_link_list); kfree(link); } write_unlock(&css_set_lock); if (!list_empty(&root->root_list)) { list_del(&root->root_list); root_count--; } mutex_unlock(&cgroup_mutex); kfree(root); kill_litter_super(sb); } static struct file_system_type cgroup_fs_type = { .name = "cgroup", .get_sb = cgroup_get_sb, .kill_sb = cgroup_kill_sb, }; static inline struct cgroup *__d_cgrp(struct dentry *dentry) { return dentry->d_fsdata; } static inline struct cftype *__d_cft(struct dentry *dentry) { return dentry->d_fsdata; } /** * cgroup_path - generate the path of a cgroup * @cgrp: the cgroup in question * @buf: the buffer to write the path into * @buflen: the length of the buffer * * Called with cgroup_mutex held. Writes path of cgroup into buf. * Returns 0 on success, -errno on error. */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { char *start; if (cgrp == dummytop) { /* * Inactive subsystems have no dentry for their root * cgroup */ strcpy(buf, "/"); return 0; } start = buf + buflen; *--start = '\0'; for (;;) { int len = cgrp->dentry->d_name.len; if ((start -= len) < buf) return -ENAMETOOLONG; memcpy(start, cgrp->dentry->d_name.name, len); cgrp = cgrp->parent; if (!cgrp) break; if (!cgrp->parent) continue; if (--start < buf) return -ENAMETOOLONG; *start = '/'; } memmove(buf, start, buf + buflen - start); return 0; } /* * Return the first subsystem attached to a cgroup's hierarchy, and * its subsystem id. */ static void get_first_subsys(const struct cgroup *cgrp, struct cgroup_subsys_state **css, int *subsys_id) { const struct cgroupfs_root *root = cgrp->root; const struct cgroup_subsys *test_ss; BUG_ON(list_empty(&root->subsys_list)); test_ss = list_entry(root->subsys_list.next, struct cgroup_subsys, sibling); if (css) { *css = cgrp->subsys[test_ss->subsys_id]; BUG_ON(!*css); } if (subsys_id) *subsys_id = test_ss->subsys_id; } /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' * @cgrp: the cgroup the task is attaching to * @tsk: the task to be attached * * Call holding cgroup_mutex. May take task_lock of * the task 'tsk' during call. 
*/ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { int retval = 0; struct cgroup_subsys *ss; struct cgroup *oldcgrp; struct css_set *cg = tsk->cgroups; struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; int subsys_id; get_first_subsys(cgrp, NULL, &subsys_id); /* Nothing to do if the task is already in that cgroup */ oldcgrp = task_cgroup(tsk, subsys_id); if (cgrp == oldcgrp) return 0; for_each_subsys(root, ss) { if (ss->can_attach) { retval = ss->can_attach(ss, cgrp, tsk); if (retval) return retval; } } /* * Locate or allocate a new css_set for this task, * based on its final set of cgroups */ newcg = find_css_set(cg, cgrp); if (!newcg) return -ENOMEM; task_lock(tsk); if (tsk->flags & PF_EXITING) { task_unlock(tsk); put_css_set(newcg); return -ESRCH; } rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); /* Update the css_set linked lists if we're using them */ write_lock(&css_set_lock); if (!list_empty(&tsk->cg_list)) { list_del(&tsk->cg_list); list_add(&tsk->cg_list, &newcg->tasks); } write_unlock(&css_set_lock); for_each_subsys(root, ss) { if (ss->attach) ss->attach(ss, cgrp, oldcgrp, tsk); } set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); put_css_set(cg); return 0; } /* * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex * held. May take task_lock of task */ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; int ret; if (pid) { rcu_read_lock(); tsk = find_task_by_vpid(pid); if (!tsk || tsk->flags & PF_EXITING) { rcu_read_unlock(); return -ESRCH; } tcred = __task_cred(tsk); if (cred->euid && cred->euid != tcred->uid && cred->euid != tcred->suid) { rcu_read_unlock(); return -EACCES; } get_task_struct(tsk); rcu_read_unlock(); } else { tsk = current; get_task_struct(tsk); } ret = cgroup_attach_task(cgrp, tsk); put_task_struct(tsk); return ret; } static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) { int ret; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; ret = attach_task_by_pid(cgrp, pid); cgroup_unlock(); return ret; } /* The various types of files and directories in a cgroup file system */ enum cgroup_filetype { FILE_ROOT, FILE_DIR, FILE_TASKLIST, FILE_NOTIFY_ON_RELEASE, FILE_RELEASE_AGENT, }; /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness * * On success, returns true; the lock should be later released with * cgroup_unlock(). On failure returns false with no lock held. 
*/ bool cgroup_lock_live_group(struct cgroup *cgrp) { mutex_lock(&cgroup_mutex); if (cgroup_is_removed(cgrp)) { mutex_unlock(&cgroup_mutex); return false; } return true; } static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); if (!cgroup_lock_live_group(cgrp)) return -ENODEV; strcpy(cgrp->root->release_agent_path, buffer); cgroup_unlock(); return 0; } static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, struct seq_file *seq) { if (!cgroup_lock_live_group(cgrp)) return -ENODEV; seq_puts(seq, cgrp->root->release_agent_path); seq_putc(seq, '\n'); cgroup_unlock(); return 0; } /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, struct file *file, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) { char buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; char *end; if (!nbytes) return -EINVAL; if (nbytes >= sizeof(buffer)) return -E2BIG; if (copy_from_user(buffer, userbuf, nbytes)) return -EFAULT; buffer[nbytes] = 0; /* nul-terminate */ strstrip(buffer); if (cft->write_u64) { u64 val = simple_strtoull(buffer, &end, 0); if (*end) return -EINVAL; retval = cft->write_u64(cgrp, cft, val); } else { s64 val = simple_strtoll(buffer, &end, 0); if (*end) return -EINVAL; retval = cft->write_s64(cgrp, cft, val); } if (!retval) retval = nbytes; return retval; } static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, struct file *file, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) { char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; size_t max_bytes = cft->max_write_len; char *buffer = local_buffer; if (!max_bytes) max_bytes = sizeof(local_buffer) - 1; if (nbytes >= max_bytes) return -E2BIG; /* Allocate a dynamic buffer if we need one */ if (nbytes >= sizeof(local_buffer)) { buffer = kmalloc(nbytes + 1, GFP_KERNEL); if (buffer == NULL) return -ENOMEM; } if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { retval = -EFAULT; goto out; } buffer[nbytes] = 0; /* nul-terminate */ strstrip(buffer); retval = cft->write_string(cgrp, cft, buffer); if (!retval) retval = nbytes; out: if (buffer != local_buffer) kfree(buffer); return retval; } static ssize_t cgroup_file_write(struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos) { struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); if (!cft || cgroup_is_removed(cgrp)) return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); if (cft->write_string) return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); if (cft->trigger) { int ret = cft->trigger(cgrp, (unsigned int)cft->private); return ret ? 
ret : nbytes; } return -EINVAL; } static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { char tmp[CGROUP_LOCAL_BUFFER_SIZE]; u64 val = cft->read_u64(cgrp, cft); int len = sprintf(tmp, "%llu\n", (unsigned long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { char tmp[CGROUP_LOCAL_BUFFER_SIZE]; s64 val = cft->read_s64(cgrp, cft); int len = sprintf(tmp, "%lld\n", (long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } static ssize_t cgroup_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); if (!cft || cgroup_is_removed(cgrp)) return -ENODEV; if (cft->read) return cft->read(cgrp, cft, file, buf, nbytes, ppos); if (cft->read_u64) return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); if (cft->read_s64) return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); return -EINVAL; } /* * seqfile ops/methods for returning structured data. Currently just * supports string->u64 maps, but can be extended in future. */ struct cgroup_seqfile_state { struct cftype *cft; struct cgroup *cgroup; }; static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) { struct seq_file *sf = cb->state; return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cgroup_seqfile_state *state = m->private; struct cftype *cft = state->cft; if (cft->read_map) { struct cgroup_map_cb cb = { .fill = cgroup_map_add, .state = m, }; return cft->read_map(state->cgroup, cft, &cb); } return cft->read_seq_string(state->cgroup, cft, m); } static int cgroup_seqfile_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; kfree(seq->private); return single_release(inode, file); } static struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, .release = cgroup_seqfile_release, }; static int cgroup_file_open(struct inode *inode, struct file *file) { int err; struct cftype *cft; err = generic_file_open(inode, file); if (err) return err; cft = __d_cft(file->f_dentry); if (!cft) return -ENODEV; if (cft->read_map || cft->read_seq_string) { struct cgroup_seqfile_state *state = kzalloc(sizeof(*state), GFP_USER); if (!state) return -ENOMEM; state->cft = cft; state->cgroup = __d_cgrp(file->f_dentry->d_parent); file->f_op = &cgroup_seqfile_operations; err = single_open(file, cgroup_seqfile_show, state); if (err < 0) kfree(state); } else if (cft->open) err = cft->open(inode, file); else err = 0; return err; } static int cgroup_file_release(struct inode *inode, struct file *file) { struct cftype *cft = __d_cft(file->f_dentry); if (cft->release) return cft->release(inode, file); return 0; } /* * cgroup_rename - Only allow simple rename of directories in place. 
*/ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { if (!S_ISDIR(old_dentry->d_inode->i_mode)) return -ENOTDIR; if (new_dentry->d_inode) return -EEXIST; if (old_dir != new_dir) return -EIO; return simple_rename(old_dir, old_dentry, new_dir, new_dentry); } static struct file_operations cgroup_file_operations = { .read = cgroup_file_read, .write = cgroup_file_write, .llseek = generic_file_llseek, .open = cgroup_file_open, .release = cgroup_file_release, }; static struct inode_operations cgroup_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, }; static int cgroup_create_file(struct dentry *dentry, int mode, struct super_block *sb) { static struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, }; struct inode *inode; if (!dentry) return -ENOENT; if (dentry->d_inode) return -EEXIST; inode = cgroup_new_inode(mode, sb); if (!inode) return -ENOMEM; if (S_ISDIR(mode)) { inode->i_op = &cgroup_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); /* start with the directory inode held, so that we can * populate it without racing with another mkdir */ mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); } else if (S_ISREG(mode)) { inode->i_size = 0; inode->i_fop = &cgroup_file_operations; } dentry->d_op = &cgroup_dops; d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ return 0; } /* * cgroup_create_dir - create a directory for an object. * @cgrp: the cgroup we create the directory for. It must have a valid * ->parent field. And we are going to fill its ->dentry field. * @dentry: dentry of the new cgroup * @mode: mode to set on new directory. */ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, int mode) { struct dentry *parent; int error = 0; parent = cgrp->parent->dentry; error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); if (!error) { dentry->d_fsdata = cgrp; inc_nlink(parent->d_inode); cgrp->dentry = dentry; dget(dentry); } dput(dentry); return error; } int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, const struct cftype *cft) { struct dentry *dir = cgrp->dentry; struct dentry *dentry; int error; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { strcpy(name, subsys->name); strcat(name, "."); } strcat(name, cft->name); BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); dentry = lookup_one_len(name, dir, strlen(name)); if (!IS_ERR(dentry)) { error = cgroup_create_file(dentry, 0644 | S_IFREG, cgrp->root->sb); if (!error) dentry->d_fsdata = (void *)cft; dput(dentry); } else error = PTR_ERR(dentry); return error; } int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, const struct cftype cft[], int count) { int i, err; for (i = 0; i < count; i++) { err = cgroup_add_file(cgrp, subsys, &cft[i]); if (err) return err; } return 0; } /** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * * Return the number of tasks in the cgroup. */ int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cg_cgroup_link *link; read_lock(&css_set_lock); list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { count += atomic_read(&link->cg->refcount); } read_unlock(&css_set_lock); return count; } /* * Advance a list_head iterator. 
The iterator should be positioned at * the start of a css_set */ static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) { struct list_head *l = it->cg_link; struct cg_cgroup_link *link; struct css_set *cg; /* Advance to the next non-empty css_set */ do { l = l->next; if (l == &cgrp->css_sets) { it->cg_link = NULL; return; } link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); cg = link->cg; } while (list_empty(&cg->tasks)); it->cg_link = l; it->task = cg->tasks.next; } /* * To reduce the fork() overhead for systems that are not actually * using their cgroups capability, we don't maintain the lists running * through each css_set to its tasks until we see the list actually * used - in other words after the first call to cgroup_iter_start(). * * The tasklist_lock is not held here, as do_each_thread() and * while_each_thread() are protected by RCU. */ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; write_lock(&css_set_lock); use_task_css_set_links = 1; do_each_thread(g, p) { task_lock(p); /* * We should check if the process is exiting, otherwise * it will race with cgroup_exit() in that the list * entry won't be deleted though the process has exited. */ if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) list_add(&p->cg_list, &p->cgroups->tasks); task_unlock(p); } while_each_thread(g, p); write_unlock(&css_set_lock); } void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) { /* * The first time anyone tries to iterate across a cgroup, * we need to enable the list linking each css_set to its * tasks, and fix up all existing tasks. */ if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); read_lock(&css_set_lock); it->cg_link = &cgrp->css_sets; cgroup_advance_iter(cgrp, it); } struct task_struct *cgroup_iter_next(struct cgroup *cgrp, struct cgroup_iter *it) { struct task_struct *res; struct list_head *l = it->task; /* If the iterator cg is NULL, we have no tasks */ if (!it->cg_link) return NULL; res = list_entry(l, struct task_struct, cg_list); /* Advance iterator to find next entry */ l = l->next; if (l == &res->cgroups->tasks) { /* We reached the end of this task list - move on to * the next cg_cgroup_link */ cgroup_advance_iter(cgrp, it); } else { it->task = l; } return res; } void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) { read_unlock(&css_set_lock); } static inline int started_after_time(struct task_struct *t1, struct timespec *time, struct task_struct *t2) { int start_diff = timespec_compare(&t1->start_time, time); if (start_diff > 0) { return 1; } else if (start_diff < 0) { return 0; } else { /* * Arbitrarily, if two processes started at the same * time, we'll say that the lower pointer value * started first. Note that t2 may have exited by now * so this may not be a valid pointer any longer, but * that's fine - it still serves to distinguish * between two tasks started (effectively) simultaneously. */ return t1 > t2; } } /* * This function is a callback from heap_insert() and is used to order * the heap. * In this case we order the heap in descending task start time. */ static inline int started_after(void *p1, void *p2) { struct task_struct *t1 = p1; struct task_struct *t2 = p2; return started_after_time(t1, &t2->start_time, t2); } /** * cgroup_scan_tasks - iterate though all the tasks in a cgroup * @scan: struct cgroup_scanner containing arguments for the scan * * Arguments include pointers to callback functions test_task() and * process_task(). 
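 *
 * A caller might set the scan up as follows (an illustrative sketch only:
 * my_test(), my_process(), cgrp and the scan variable are hypothetical,
 * while the cgroup_scanner fields match how this function uses them):
 *
 *	static int my_test(struct task_struct *p, struct cgroup_scanner *scan)
 *	{
 *		return !(p->flags & PF_EXITING);
 *	}
 *
 *	static void my_process(struct task_struct *p, struct cgroup_scanner *scan)
 *	{
 *		printk(KERN_DEBUG "scanning task %d\n", task_pid_vnr(p));
 *	}
 *
 *	struct cgroup_scanner scan = {
 *		.cg = cgrp,
 *		.test_task = my_test,
 *		.process_task = my_process,
 *		.heap = NULL,
 *	};
 *	cgroup_scan_tasks(&scan);
 *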
 * Iterate through all the tasks in a cgroup, calling test_task() for each,
 * and if it returns true, call process_task() for it also.
 * The test_task pointer may be NULL, meaning always true (select all tasks).
 * Effectively duplicates cgroup_iter_{start,next,end}()
 * but does not lock css_set_lock for the call to process_task().
 * The struct cgroup_scanner may be embedded in any structure of the caller's
 * creation.
 * It is guaranteed that process_task() will act on every task that
 * is a member of the cgroup for the duration of this call. This
 * function may or may not call process_task() for tasks that exit
 * or move to a different cgroup during the call, or are forked or
 * move into the cgroup during the call.
 *
 * Note that test_task() may be called with locks held, and may in some
 * situations be called multiple times for the same task, so it should
 * be cheap.
 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
 * pre-allocated and will be used for heap operations (and its "gt" member will
 * be overwritten), else a temporary heap will be used (allocation of which
 * may cause this function to fail).
 */
int cgroup_scan_tasks(struct cgroup_scanner *scan)
{
	int retval, i;
	struct cgroup_iter it;
	struct task_struct *p, *dropped;
	/* Never dereference latest_task, since it's not refcounted */
	struct task_struct *latest_task = NULL;
	struct ptr_heap tmp_heap;
	struct ptr_heap *heap;
	struct timespec latest_time = { 0, 0 };

	if (scan->heap) {
		/* The caller supplied our heap and pre-allocated its memory */
		heap = scan->heap;
		heap->gt = &started_after;
	} else {
		/* We need to allocate our own heap memory */
		heap = &tmp_heap;
		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
		if (retval)
			/* cannot allocate the heap */
			return retval;
	}

 again:
	/*
	 * Scan tasks in the cgroup, using the scanner's "test_task" callback
	 * to determine which are of interest, and using the scanner's
	 * "process_task" callback to process any of them that need an update.
	 * Since we don't want to hold any locks during the task updates,
	 * gather tasks to be processed in a heap structure.
	 * The heap is sorted by descending task start time.
	 * If the statically-sized heap fills up, we overflow tasks that
	 * started later, and in future iterations only consider tasks that
	 * started after the latest task in the previous pass. This
	 * guarantees forward progress and that we don't miss any tasks.
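	 * With the temporary heap sized at PAGE_SIZE, and assuming one task
	 * pointer per heap slot (as the heap->ptrs[] use below suggests),
	 * that is roughly PAGE_SIZE / sizeof(void *) tasks per pass - about
	 * 512 with 4 KB pages on a 64-bit kernel - so larger cgroups simply
	 * take several passes.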
*/ heap->size = 0; cgroup_iter_start(scan->cg, &it); while ((p = cgroup_iter_next(scan->cg, &it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one */ if (scan->test_task && !scan->test_task(p, scan)) continue; /* * Only process tasks that started after the last task * we processed */ if (!started_after_time(p, &latest_time, latest_task)) continue; dropped = heap_insert(heap, p); if (dropped == NULL) { /* * The new task was inserted; the heap wasn't * previously full */ get_task_struct(p); } else if (dropped != p) { /* * The new task was inserted, and pushed out a * different task */ get_task_struct(p); put_task_struct(dropped); } /* * Else the new task was newer than anything already in * the heap and wasn't inserted */ } cgroup_iter_end(scan->cg, &it); if (heap->size) { for (i = 0; i < heap->size; i++) { struct task_struct *q = heap->ptrs[i]; if (i == 0) { latest_time = q->start_time; latest_task = q; } /* Process the task per the caller's callback */ scan->process_task(q, scan); put_task_struct(q); } /* * If we had to process any tasks at all, scan again * in case some of them were in the middle of forking * children that didn't get processed. * Not the most efficient way to do it, but it avoids * having to take callback_mutex in the fork path */ goto again; } if (heap == &tmp_heap) heap_free(&tmp_heap); return 0; } /* * Stuff for reading the 'tasks' file. * * Reading this file can return large amounts of data if a cgroup has * *lots* of attached tasks. So it may need several calls to read(), * but we cannot guarantee that the information we produce is correct * unless we produce it entirely atomically. * */ /* * Load into 'pidarray' up to 'npids' of the tasks using cgroup * 'cgrp'. Return actual number of pids loaded. No need to * task_lock(p) when reading out p->cgroup, since we're in an RCU * read section, so the css_set can't go away, and is * immutable after creation. */ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) { int n = 0; struct cgroup_iter it; struct task_struct *tsk; cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { if (unlikely(n == npids)) break; pidarray[n++] = task_pid_vnr(tsk); } cgroup_iter_end(cgrp, &it); return n; } /** * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into * @dentry: A dentry entry belonging to the cgroup for which stats have * been requested. * * Build and fill cgroupstats so that taskstats can export it to user * space. */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { int ret = -EINVAL; struct cgroup *cgrp; struct cgroup_iter it; struct task_struct *tsk; /* * Validate dentry by checking the superblock operations, * and make sure it's a directory. */ if (dentry->d_sb->s_op != &cgroup_ops || !S_ISDIR(dentry->d_inode->i_mode)) goto err; ret = 0; cgrp = dentry->d_fsdata; rcu_read_lock(); cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; break; case TASK_INTERRUPTIBLE: stats->nr_sleeping++; break; case TASK_UNINTERRUPTIBLE: stats->nr_uninterruptible++; break; case TASK_STOPPED: stats->nr_stopped++; break; default: if (delayacct_is_task_waiting_on_io(tsk)) stats->nr_io_wait++; break; } } cgroup_iter_end(cgrp, &it); rcu_read_unlock(); err: return ret; } static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } /* * seq_file methods for the "tasks" file. 
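 * The array was sorted with cmppid() when the file was opened, which is
 * what lets cgroup_tasks_start() binary-search it.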
The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid * in the cgroup->tasks_pids array. */ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) { /* * Initially we receive a position value that corresponds to * one more than the last pid shown (or 0 on the first call or * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ struct cgroup *cgrp = s->private; int index = 0, pid = *pos; int *iter; down_read(&cgrp->pids_mutex); if (pid) { int end = cgrp->pids_length; while (index < end) { int mid = (index + end) / 2; if (cgrp->tasks_pids[mid] == pid) { index = mid; break; } else if (cgrp->tasks_pids[mid] <= pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ if (index >= cgrp->pids_length) return NULL; /* Update the abstract position to be the actual pid that we found */ iter = cgrp->tasks_pids + index; *pos = *iter; return iter; } static void cgroup_tasks_stop(struct seq_file *s, void *v) { struct cgroup *cgrp = s->private; up_read(&cgrp->pids_mutex); } static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) { struct cgroup *cgrp = s->private; int *p = v; int *end = cgrp->tasks_pids + cgrp->pids_length; /* * Advance to the next pid in the array. If this goes off the * end, we're done */ p++; if (p >= end) { return NULL; } else { *pos = *p; return p; } } static int cgroup_tasks_show(struct seq_file *s, void *v) { return seq_printf(s, "%d\n", *(int *)v); } static struct seq_operations cgroup_tasks_seq_operations = { .start = cgroup_tasks_start, .stop = cgroup_tasks_stop, .next = cgroup_tasks_next, .show = cgroup_tasks_show, }; static void release_cgroup_pid_array(struct cgroup *cgrp) { down_write(&cgrp->pids_mutex); BUG_ON(!cgrp->pids_use_count); if (!--cgrp->pids_use_count) { kfree(cgrp->tasks_pids); cgrp->tasks_pids = NULL; cgrp->pids_length = 0; } up_write(&cgrp->pids_mutex); } static int cgroup_tasks_release(struct inode *inode, struct file *file) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); if (!(file->f_mode & FMODE_READ)) return 0; release_cgroup_pid_array(cgrp); return seq_release(inode, file); } static struct file_operations cgroup_tasks_operations = { .read = seq_read, .llseek = seq_lseek, .write = cgroup_file_write, .release = cgroup_tasks_release, }; /* * Handle an open on 'tasks' file. Prepare an array containing the * process id's of tasks currently attached to the cgroup being opened. */ static int cgroup_tasks_open(struct inode *unused, struct file *file) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); pid_t *pidarray; int npids; int retval; /* Nothing to do for write-only files */ if (!(file->f_mode & FMODE_READ)) return 0; /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the * caller from the case that the additional cgroup users didn't * show up until sometime later on. 
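	 * If tasks exit between the count and the load, pid_array_load()
	 * simply returns fewer entries and npids is trimmed to the number
	 * actually loaded.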
*/ npids = cgroup_task_count(cgrp); pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); if (!pidarray) return -ENOMEM; npids = pid_array_load(pidarray, npids, cgrp); sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); /* * Store the array in the cgroup, freeing the old * array if necessary */ down_write(&cgrp->pids_mutex); kfree(cgrp->tasks_pids); cgrp->tasks_pids = pidarray; cgrp->pids_length = npids; cgrp->pids_use_count++; up_write(&cgrp->pids_mutex); file->f_op = &cgroup_tasks_operations; retval = seq_open(file, &cgroup_tasks_seq_operations); if (retval) { release_cgroup_pid_array(cgrp); return retval; } ((struct seq_file *)file->private_data)->private = cgrp; return 0; } static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, struct cftype *cft) { return notify_on_release(cgrp); } static int cgroup_write_notify_on_release(struct cgroup *cgrp, struct cftype *cft, u64 val) { clear_bit(CGRP_RELEASABLE, &cgrp->flags); if (val) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); else clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); return 0; } /* * for the common functions, 'private' gives the type of file */ static struct cftype files[] = { { .name = "tasks", .open = cgroup_tasks_open, .write_u64 = cgroup_tasks_write, .release = cgroup_tasks_release, .private = FILE_TASKLIST, }, { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, .private = FILE_NOTIFY_ON_RELEASE, }, }; static struct cftype cft_release_agent = { .name = "release_agent", .read_seq_string = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, .private = FILE_RELEASE_AGENT, }; static int cgroup_populate_dir(struct cgroup *cgrp) { int err; struct cgroup_subsys *ss; /* First clear out any existing files */ cgroup_clear_directory(cgrp->dentry); err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); if (err < 0) return err; if (cgrp == cgrp->top_cgroup) { if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) return err; } for_each_subsys(cgrp->root, ss) { if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) return err; } return 0; } static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { css->cgroup = cgrp; atomic_set(&css->refcnt, 0); css->flags = 0; if (cgrp == dummytop) set_bit(CSS_ROOT, &css->flags); BUG_ON(cgrp->subsys[ss->subsys_id]); cgrp->subsys[ss->subsys_id] = css; } /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup * @dentry: dentry of the new cgroup * @mode: mode to set on new inode * * Must be called with the mutex on the parent inode held */ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, int mode) { struct cgroup *cgrp; struct cgroupfs_root *root = parent->root; int err = 0; struct cgroup_subsys *ss; struct super_block *sb = root->sb; cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) return -ENOMEM; /* Grab a reference on the superblock so the hierarchy doesn't * get deleted on unmount if there are child cgroups. 
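	 * (The error paths below drop this reference again through
	 * deactivate_super().)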
This * can be done outside cgroup_mutex, since the sb can't * disappear while someone has an open control file on the * fs */ atomic_inc(&sb->s_active); mutex_lock(&cgroup_mutex); init_cgroup_housekeeping(cgrp); cgrp->parent = parent; cgrp->root = parent->root; cgrp->top_cgroup = parent->top_cgroup; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); for_each_subsys(root, ss) { struct cgroup_subsys_state *css = ss->create(ss, cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_destroy; } init_cgroup_css(css, ss, cgrp); } list_add(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; err = cgroup_create_dir(cgrp, dentry, mode); if (err < 0) goto err_remove; /* The cgroup directory was pre-locked for us */ BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); err = cgroup_populate_dir(cgrp); /* If err < 0, we have a half-filled directory - oh well ;) */ mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return 0; err_remove: list_del(&cgrp->sibling); root->number_of_cgroups--; err_destroy: for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) ss->destroy(ss, cgrp); } mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ deactivate_super(sb); kfree(cgrp); return err; } static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) { struct cgroup *c_parent = dentry->d_parent->d_fsdata; /* the vfs holds inode->i_mutex already */ return cgroup_create(c_parent, dentry, mode | S_IFDIR); } static int cgroup_has_css_refs(struct cgroup *cgrp) { /* Check the reference count on each subsystem. Since we * already established that there are no tasks in the * cgroup, if the css refcount is also 0, then there should * be no outstanding references, so the subsystem is safe to * destroy. We scan across all subsystems rather than using * the per-hierarchy linked list of mounted subsystems since * we can be called via check_for_release() with no * synchronization other than RCU, and the subsystem linked * list isn't RCU-safe */ int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; struct cgroup_subsys_state *css; /* Skip subsystems not in this hierarchy */ if (ss->root != cgrp->root) continue; css = cgrp->subsys[ss->subsys_id]; /* When called from check_for_release() it's possible * that by this point the cgroup has been removed * and the css deleted. But a false-positive doesn't * matter, since it can only happen if the cgroup * has been deleted and hence no longer needs the * release agent to be called anyway. */ if (css && atomic_read(&css->refcnt)) return 1; } return 0; } static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cgroup *cgrp = dentry->d_fsdata; struct dentry *d; struct cgroup *parent; struct super_block *sb; struct cgroupfs_root *root; /* the vfs holds both inode->i_mutex already */ mutex_lock(&cgroup_mutex); if (atomic_read(&cgrp->count) != 0) { mutex_unlock(&cgroup_mutex); return -EBUSY; } if (!list_empty(&cgrp->children)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } mutex_unlock(&cgroup_mutex); /* * Call pre_destroy handlers of subsys. Notify subsystems * that rmdir() request comes. 
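	 * The handlers run with cgroup_mutex dropped, so the emptiness
	 * checks above are repeated once the mutex is retaken below.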
*/ cgroup_call_pre_destroy(cgrp); mutex_lock(&cgroup_mutex); parent = cgrp->parent; root = cgrp->root; sb = root->sb; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children) || cgroup_has_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) list_del(&cgrp->release_list); spin_unlock(&release_list_lock); /* delete my sibling from parent->children */ list_del(&cgrp->sibling); spin_lock(&cgrp->dentry->d_lock); d = dget(cgrp->dentry); spin_unlock(&d->d_lock); cgroup_d_remove_dir(d); dput(d); set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); mutex_unlock(&cgroup_mutex); return 0; } static void __init cgroup_init_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); /* Create the top cgroup state for this subsystem */ ss->root = &rootnode; css = ss->create(ss, dummytop); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, dummytop); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's top cgroup. */ init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; need_forkexit_callback |= ss->fork || ss->exit; need_mm_owner_callback |= !!ss->mm_owner_changed; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); ss->active = 1; } /** * cgroup_init_early - cgroup initialization at system boot * * Initialize cgroups at system boot, and initialize any * subsystems that request early init. */ int __init cgroup_init_early(void) { int i; atomic_set(&init_css_set.refcount, 1); INIT_LIST_HEAD(&init_css_set.cg_links); INIT_LIST_HEAD(&init_css_set.tasks); INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; init_cgroup_root(&rootnode); list_add(&rootnode.root_list, &roots); root_count = 1; init_task.cgroups = &init_css_set; init_css_set_link.cg = &init_css_set; list_add(&init_css_set_link.cgrp_link_list, &rootnode.top_cgroup.css_sets); list_add(&init_css_set_link.cg_link_list, &init_css_set.cg_links); for (i = 0; i < CSS_SET_TABLE_SIZE; i++) INIT_HLIST_HEAD(&css_set_table[i]); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; BUG_ON(!ss->name); BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); BUG_ON(!ss->create); BUG_ON(!ss->destroy); if (ss->subsys_id != i) { printk(KERN_ERR "cgroup: Subsys %s id == %d\n", ss->name, ss->subsys_id); BUG(); } if (ss->early_init) cgroup_init_subsys(ss); } return 0; } /** * cgroup_init - cgroup initialization * * Register cgroup filesystem and /proc file, and initialize * any subsystems that didn't request early init. 
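 * Subsystems that set ->early_init were already initialized from
 * cgroup_init_early().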
*/ int __init cgroup_init(void) { int err; int i; struct hlist_head *hhead; err = bdi_init(&cgroup_backing_dev_info); if (err) return err; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (!ss->early_init) cgroup_init_subsys(ss); } /* Add init_css_set to the hash table */ hhead = css_set_hash(init_css_set.subsys); hlist_add_head(&init_css_set.hlist, hhead); err = register_filesystem(&cgroup_fs_type); if (err < 0) goto out; proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); out: if (err) bdi_destroy(&cgroup_backing_dev_info); return err; } /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc/<pid>/cgroup. * - No need to task_lock(tsk) on this tsk->cgroup reference, as it * doesn't really matter if tsk->cgroup changes after we read it, * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it * anyway. No need to check that tsk->cgroup != NULL, thanks to * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks * cgroup to top_cgroup. */ /* TODO: Use a proper seq_file iterator */ static int proc_cgroup_show(struct seq_file *m, void *v) { struct pid *pid; struct task_struct *tsk; char *buf; int retval; struct cgroupfs_root *root; retval = -ENOMEM; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) goto out; retval = -ESRCH; pid = m->private; tsk = get_pid_task(pid, PIDTYPE_PID); if (!tsk) goto out_free; retval = 0; mutex_lock(&cgroup_mutex); for_each_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int subsys_id; int count = 0; /* Skip this hierarchy if it has no active subsystems */ if (!root->actual_subsys_bits) continue; seq_printf(m, "%lu:", root->subsys_bits); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); seq_putc(m, ':'); get_first_subsys(&root->top_cgroup, NULL, &subsys_id); cgrp = task_cgroup(tsk, subsys_id); retval = cgroup_path(cgrp, buf, PAGE_SIZE); if (retval < 0) goto out_unlock; seq_puts(m, buf); seq_putc(m, '\n'); } out_unlock: mutex_unlock(&cgroup_mutex); put_task_struct(tsk); out_free: kfree(buf); out: return retval; } static int cgroup_open(struct inode *inode, struct file *file) { struct pid *pid = PROC_I(inode)->pid; return single_open(file, proc_cgroup_show, pid); } struct file_operations proc_cgroup_operations = { .open = cgroup_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; /* Display information about each subsystem and each hierarchy */ static int proc_cgroupstats_show(struct seq_file *m, void *v) { int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; seq_printf(m, "%s\t%lu\t%d\t%d\n", ss->name, ss->root->subsys_bits, ss->root->number_of_cgroups, !ss->disabled); } mutex_unlock(&cgroup_mutex); return 0; } static int cgroupstats_open(struct inode *inode, struct file *file) { return single_open(file, proc_cgroupstats_show, NULL); } static struct file_operations proc_cgroupstats_operations = { .open = cgroupstats_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; /** * cgroup_fork - attach newly forked task to its parents cgroup. * @child: pointer to task_struct of forking parent process. * * Description: A task inherits its parent's cgroup at fork(). * * A pointer to the shared css_set was automatically copied in * fork.c by dup_task_struct(). 
However, we ignore that copy, since * it was not made under the protection of RCU or cgroup_mutex, so * might no longer be a valid cgroup pointer. cgroup_attach_task() might * have already changed current->cgroups, allowing the previously * referenced cgroup group to be removed and freed. * * At the point that cgroup_fork() is called, 'current' is the parent * task, and the passed argument 'child' points to the child task. */ void cgroup_fork(struct task_struct *child) { task_lock(current); child->cgroups = current->cgroups; get_css_set(child->cgroups); task_unlock(current); INIT_LIST_HEAD(&child->cg_list); } /** * cgroup_fork_callbacks - run fork callbacks * @child: the new task * * Called on a new task very soon before adding it to the * tasklist. No need to take any locks since no-one can * be operating on this task. */ void cgroup_fork_callbacks(struct task_struct *child) { if (need_forkexit_callback) { int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->fork) ss->fork(ss, child); } } } #ifdef CONFIG_MM_OWNER /** * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes * @p: the new owner * * Called on every change to mm->owner. mm_init_owner() does not * invoke this routine, since it assigns the mm->owner the first time * and does not change it. * * The callbacks are invoked with mmap_sem held in read mode. */ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) { struct cgroup *oldcgrp, *newcgrp = NULL; if (need_mm_owner_callback) { int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; oldcgrp = task_cgroup(old, ss->subsys_id); if (new) newcgrp = task_cgroup(new, ss->subsys_id); if (oldcgrp == newcgrp) continue; if (ss->mm_owner_changed) ss->mm_owner_changed(ss, oldcgrp, newcgrp, new); } } } #endif /* CONFIG_MM_OWNER */ /** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question * * Adds the task to the list running through its css_set if necessary. * Has to be after the task is visible on the task list in case we race * with the first call to cgroup_iter_start() - to guarantee that the * new task ends up on its list. */ void cgroup_post_fork(struct task_struct *child) { if (use_task_css_set_links) { write_lock(&css_set_lock); if (list_empty(&child->cg_list)) list_add(&child->cg_list, &child->cgroups->tasks); write_unlock(&css_set_lock); } } /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * @run_callback: run exit callbacks? * * Description: Detach cgroup from @tsk and release it. * * Note that cgroups marked notify_on_release force every task in * them to take the global cgroup_mutex mutex when exiting. * This could impact scaling on very large systems. Be reluctant to * use notify_on_release cgroups where very high task exit scaling * is required on large systems. * * the_top_cgroup_hack: * * Set the exiting tasks cgroup to the root cgroup (top_cgroup). * * We call cgroup_exit() while the task is still competent to * handle notify_on_release(), then leave the task attached to the * root cgroup in each hierarchy for the remainder of its exit. * * To do this properly, we would increment the reference count on * top_cgroup, and near the very end of the kernel/exit.c do_exit() * code we would add a second cgroup function call, to drop that * reference. This would just create an unnecessary hot spot on * the top_cgroup reference count, to no avail. 
* * Normally, holding a reference to a cgroup without bumping its * count is unsafe. The cgroup could go away, or someone could * attach us to a different cgroup, decrementing the count on * the first cgroup that we never incremented. But in this case, * top_cgroup isn't going away, and either task has PF_EXITING set, * which wards off any cgroup_attach_task() attempts, or task is a failed * fork, never visible to cgroup_attach_task. */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { int i; struct css_set *cg; if (run_callbacks && need_forkexit_callback) { for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->exit) ss->exit(ss, tsk); } } /* * Unlink from the css_set task list if necessary. * Optimistically check cg_list before taking * css_set_lock */ if (!list_empty(&tsk->cg_list)) { write_lock(&css_set_lock); if (!list_empty(&tsk->cg_list)) list_del(&tsk->cg_list); write_unlock(&css_set_lock); } /* Reassign the task to the init_css_set. */ task_lock(tsk); cg = tsk->cgroups; tsk->cgroups = &init_css_set; task_unlock(tsk); if (cg) put_css_set_taskexit(cg); } /** * cgroup_clone - clone the cgroup the given subsystem is attached to * @tsk: the task to be moved * @subsys: the given subsystem * @nodename: the name for the new cgroup * * Duplicate the current cgroup in the hierarchy that the given * subsystem is attached to, and move this task into the new * child. */ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, char *nodename) { struct dentry *dentry; int ret = 0; struct cgroup *parent, *child; struct inode *inode; struct css_set *cg; struct cgroupfs_root *root; struct cgroup_subsys *ss; /* We shouldn't be called by an unregistered subsystem */ BUG_ON(!subsys->active); /* First figure out what hierarchy and cgroup we're dealing * with, and pin them so we can drop cgroup_mutex */ mutex_lock(&cgroup_mutex); again: root = subsys->root; if (root == &rootnode) { mutex_unlock(&cgroup_mutex); return 0; } cg = tsk->cgroups; parent = task_cgroup(tsk, subsys->subsys_id); /* Pin the hierarchy */ atomic_inc(&parent->root->sb->s_active); /* Keep the cgroup alive */ get_css_set(cg); mutex_unlock(&cgroup_mutex); /* Now do the VFS work to create a cgroup */ inode = parent->dentry->d_inode; /* Hold the parent directory mutex across this operation to * stop anyone else deleting the new cgroup */ mutex_lock(&inode->i_mutex); dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); if (IS_ERR(dentry)) { printk(KERN_INFO "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, PTR_ERR(dentry)); ret = PTR_ERR(dentry); goto out_release; } /* Create the cgroup directory, which also creates the cgroup */ ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); child = __d_cgrp(dentry); dput(dentry); if (ret) { printk(KERN_INFO "Failed to create cgroup %s: %d\n", nodename, ret); goto out_release; } if (!child) { printk(KERN_INFO "Couldn't find new cgroup %s\n", nodename); ret = -ENOMEM; goto out_release; } /* The cgroup now exists. Retake cgroup_mutex and check * that we're still in the same state that we thought we * were. */ mutex_lock(&cgroup_mutex); if ((root != subsys->root) || (parent != task_cgroup(tsk, subsys->subsys_id))) { /* Aargh, we raced ... */ mutex_unlock(&inode->i_mutex); put_css_set(cg); deactivate_super(parent->root->sb); /* The cgroup is still accessible in the VFS, but * we're not going to try to rmdir() it at this * point. 
*/ printk(KERN_INFO "Race in cgroup_clone() - leaking cgroup %s\n", nodename); goto again; } /* do any required auto-setup */ for_each_subsys(root, ss) { if (ss->post_clone) ss->post_clone(ss, child); } /* All seems fine. Finish by moving the task into the new cgroup */ ret = cgroup_attach_task(child, tsk); mutex_unlock(&cgroup_mutex); out_release: mutex_unlock(&inode->i_mutex); mutex_lock(&cgroup_mutex); put_css_set(cg); mutex_unlock(&cgroup_mutex); deactivate_super(parent->root->sb); return ret; } /** * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp * @cgrp: the cgroup in question * * See if @cgrp is a descendant of the current task's cgroup in * the appropriate hierarchy. * * If we are sending in dummytop, then presumably we are creating * the top cgroup in the subsystem. * * Called only by the ns (nsproxy) cgroup. */ int cgroup_is_descendant(const struct cgroup *cgrp) { int ret; struct cgroup *target; int subsys_id; if (cgrp == dummytop) return 1; get_first_subsys(cgrp, NULL, &subsys_id); target = task_cgroup(current, subsys_id); while (cgrp != target && cgrp!= cgrp->top_cgroup) cgrp = cgrp->parent; ret = (cgrp == target); return ret; } static void check_for_release(struct cgroup *cgrp) { /* All of these checks rely on RCU to keep the cgroup * structure alive */ if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { /* Control Group is currently removeable. If it's not * already queued for a userspace notification, queue * it now */ int need_schedule_work = 0; spin_lock(&release_list_lock); if (!cgroup_is_removed(cgrp) && list_empty(&cgrp->release_list)) { list_add(&cgrp->release_list, &release_list); need_schedule_work = 1; } spin_unlock(&release_list_lock); if (need_schedule_work) schedule_work(&release_agent_work); } } void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; rcu_read_lock(); if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } rcu_read_unlock(); } /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path * relative to the root of cgroup file system) as the argument. * * Most likely, this user command will try to rmdir this cgroup. * * This races with the possibility that some other task will be * attached to this cgroup before it is removed, or that some other * user task will 'mkdir' a child cgroup of this cgroup. That's ok. * The presumed 'rmdir' will fail quietly if this cgroup is no longer * unused, and this cgroup will be reprieved from its death sentence, * to continue to serve a useful existence. Next time it's released, * we will get notified again, if it still has 'notify_on_release' set. * * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which * means only wait until the task is successfully execve()'d. The * separate release agent task is forked by call_usermodehelper(), * then control in this thread returns here, without waiting for the * release agent task. We don't bother to wait because the caller of * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. 
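 *
 * For example, with release_agent set to /sbin/cgroup_release (a purely
 * hypothetical path) and a released cgroup whose path within the hierarchy
 * is /foo/bar, the usermode helper call below is roughly equivalent to:
 *
 *	HOME=/ PATH=/sbin:/bin:/usr/sbin:/usr/bin /sbin/cgroup_release /foo/bar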
 */
static void cgroup_release_agent(struct work_struct *work)
{
	BUG_ON(work != &release_agent_work);
	mutex_lock(&cgroup_mutex);
	spin_lock(&release_list_lock);
	while (!list_empty(&release_list)) {
		char *argv[3], *envp[3];
		int i;
		char *pathbuf = NULL, *agentbuf = NULL;
		struct cgroup *cgrp = list_entry(release_list.next,
						 struct cgroup,
						 release_list);
		list_del_init(&cgrp->release_list);
		spin_unlock(&release_list_lock);
		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!pathbuf)
			goto continue_free;
		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
			goto continue_free;
		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
		if (!agentbuf)
			goto continue_free;

		i = 0;
		argv[i++] = agentbuf;
		argv[i++] = pathbuf;
		argv[i] = NULL;

		i = 0;
		/* minimal command environment */
		envp[i++] = "HOME=/";
		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
		envp[i] = NULL;

		/* Drop the lock while we invoke the usermode helper,
		 * since the exec could involve hitting disk and hence
		 * be a slow process */
		mutex_unlock(&cgroup_mutex);
		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
		mutex_lock(&cgroup_mutex);
 continue_free:
		kfree(pathbuf);
		kfree(agentbuf);
		spin_lock(&release_list_lock);
	}
	spin_unlock(&release_list_lock);
	mutex_unlock(&cgroup_mutex);
}

static int __init cgroup_disable(char *str)
{
	int i;
	char *token;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
			struct cgroup_subsys *ss = subsys[i];

			if (!strcmp(token, ss->name)) {
				ss->disabled = 1;
				printk(KERN_INFO "Disabling %s control group"
					" subsystem\n", ss->name);
				break;
			}
		}
	}
	return 1;
}
__setup("cgroup_disable=", cgroup_disable);
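/*
 * Illustrative sketch (not part of this file): how a subsystem would
 * typically expose a simple u64 control file through the cftype/populate
 * machinery above. The subsystem name "example", the example_cgrp_state()
 * state-lookup helper and the weight field are hypothetical; the callback
 * signatures and the cgroup_add_files() call match the ones used in this
 * file.
 */
#if 0
static u64 example_weight_read(struct cgroup *cgrp, struct cftype *cft)
{
	/* return the per-cgroup value the file should show;
	 * example_cgrp_state() is a hypothetical helper mapping a cgroup
	 * to this subsystem's private state */
	return example_cgrp_state(cgrp)->weight;
}

static int example_weight_write(struct cgroup *cgrp, struct cftype *cft,
				u64 val)
{
	/* validate and store the value written by user space */
	if (val > 1000)
		return -EINVAL;
	example_cgrp_state(cgrp)->weight = val;
	return 0;
}

static struct cftype example_files[] = {
	{
		.name = "weight",
		.read_u64 = example_weight_read,
		.write_u64 = example_weight_write,
	},
};

/* called from cgroup_populate_dir() via ss->populate() */
static int example_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	return cgroup_add_files(cgrp, ss, example_files,
				ARRAY_SIZE(example_files));
}
#endif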