path: root/drivers/pnp/pnpacpi/rsparser.c
/*
 * pnpacpi -- PnP ACPI driver
 *
 * Copyright (c) 2004 Matthieu Castet <castet.matthieu@free.fr>
 * Copyright (c) 2004 Li Shaohua <shaohua.li@intel.com>
 * Copyright (C) 2008 Hewlett-Packard Development Company, L.P.
 *	Bjorn Helgaas <bjorn.helgaas@hp.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2, or (at your option) any
 * later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/pci.h>
#include <linux/pnp.h>
#include <linux/slab.h>
#include "../base.h"
#include "pnpacpi.h"

#ifdef CONFIG_IA64
#define valid_IRQ(i) (1)
#else
#define valid_IRQ(i) (((i) != 0) && ((i) != 2))
#endif

/*
 * Allocated Resources
 */
static int irq_flags(int triggering, int polarity, int shareable)
{
	int flags;

	if (triggering == ACPI_LEVEL_SENSITIVE) {
		if (polarity == ACPI_ACTIVE_LOW)
			flags = IORESOURCE_IRQ_LOWLEVEL;
		else
			flags = IORESOURCE_IRQ_HIGHLEVEL;
	} else {
		if (polarity == ACPI_ACTIVE_LOW)
			flags = IORESOURCE_IRQ_LOWEDGE;
		else
			flags = IORESOURCE_IRQ_HIGHEDGE;
	}

	if (shareable == ACPI_SHARED)
		flags |= IORESOURCE_IRQ_SHAREABLE;

	return flags;
}

static void decode_irq_flags(struct pnp_dev *dev, int flags, int *triggering,
			     int *polarity, int *shareable)
{
	switch (flags & (IORESOURCE_IRQ_LOWLEVEL | IORESOURCE_IRQ_HIGHLEVEL |
			 IORESOURCE_IRQ_LOWEDGE  | IORESOURCE_IRQ_HIGHEDGE)) {
	case IORESOURCE_IRQ_LOWLEVEL:
		*triggering = ACPI_LEVEL_SENSITIVE;
		*polarity = ACPI_ACTIVE_LOW;
		break;
	case IORESOURCE_IRQ_HIGHLEVEL:
		*triggering = ACPI_LEVEL_SENSITIVE;
		*polarity = ACPI_ACTIVE_HIGH;
		break;
	case IORESOURCE_IRQ_LOWEDGE:
		*triggering = ACPI_EDGE_SENSITIVE;
		*polarity = ACPI_ACTIVE_LOW;
		break;
	case IORESOURCE_IRQ_HIGHEDGE:
		*triggering = ACPI_EDGE_SENSITIVE;
		*polarity = ACPI_ACTIVE_HIGH;
		break;
	default:
		dev_err(&dev->dev, "can't encode invalid IRQ mode %#x\n",
			flags);
		*triggering = ACPI_EDGE_SENSITIVE;
		*polarity = ACPI_ACTIVE_HIGH;
		break;
	}

	if (flags & IORESOURCE_IRQ_SHAREABLE)
		*shareable = ACPI_SHARED;
	else
		*shareable = ACPI_EXCLUSIVE;
}

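/*
 * Record one IRQ from a _CRS IRQ/ExtendedIRQ descriptor: honor a firmware
 * interrupt override if one exists for this GSI, register the GSI, and add
 * the result to the device's resource table (marked disabled if the GSI is
 * invalid or registration fails).
 */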
static void pnpacpi_parse_allocated_irqresource(struct pnp_dev *dev,
						u32 gsi, int triggering,
						int polarity, int shareable)
{
	int irq, flags;
	int p, t;

	if (!valid_IRQ(gsi)) {
		pnp_add_irq_resource(dev, gsi, IORESOURCE_DISABLED);
		return;
	}

	/*
	 * In IO-APIC mode, use the overridden attributes.  Two reasons:
	 * 1. BIOS bug in the DSDT
	 * 2. BIOS uses an IO-APIC mode Interrupt Source Override
	 */
	if (!acpi_get_override_irq(gsi, &t, &p)) {
		t = t ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
		p = p ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;

		if (triggering != t || polarity != p) {
			dev_warn(&dev->dev, "IRQ %d override to %s, %s\n",
				gsi, t ? "edge":"level", p ? "low":"high");
			triggering = t;
			polarity = p;
		}
	}

	flags = irq_flags(triggering, polarity, shareable);
	irq = acpi_register_gsi(&dev->dev, gsi, triggering, polarity);
	if (irq >= 0)
		pcibios_penalize_isa_irq(irq, 1);
	else
		flags |= IORESOURCE_DISABLED;

	pnp_add_irq_resource(dev, irq, flags);
}

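/*
 * Translate ACPI DMA descriptor attributes (type, bus mastering, transfer
 * width) into IORESOURCE_DMA_* flags.  Invalid values fall back to
 * compatible / 8-and-16-bit and are reported via dev_err().
 */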
static int dma_flags(struct pnp_dev *dev, int type, int bus_master,
		     int transfer)
{
	int flags = 0;

	if (bus_master)
		flags |= IORESOURCE_DMA_MASTER;
	switch (type) {
	case ACPI_COMPATIBILITY:
		flags |= IORESOURCE_DMA_COMPATIBLE;
		break;
	case ACPI_TYPE_A:
		flags |= IORESOURCE_DMA_TYPEA;
		break;
	case ACPI_TYPE_B:
		flags |= IORESOURCE_DMA_TYPEB;
		break;
	case ACPI_TYPE_F:
		flags |= IORESOURCE_DMA_TYPEF;
		break;
	default:
		/* Fall back to a sane default */
		flags |= IORESOURCE_DMA_COMPATIBLE;
		dev_err(&dev->dev, "invalid DMA type %d\n", type);
	}
	switch (transfer) {
	case ACPI_TRANSFER_8:
		flags |= IORESOURCE_DMA_8BIT;
		break;
	case ACPI_TRANSFER_8_16:
		flags |= IORESOURCE_DMA_8AND16BIT;
		break;
	case ACPI_TRANSFER_16:
		flags |= IORESOURCE_DMA_16BIT;
		break;
	default:
		/* Fall back to a sane default */
		flags |= IORESOURCE_DMA_8AND16BIT;
		dev_err(&dev->dev, "invalid DMA transfer type %d\n", transfer);
	}

	return flags;
}

static void pnpacpi_parse_allocated_ioresource(struct pnp_dev *dev, u64 start,
					       u64 len, int io_decode,
					       int window)
{
	int flags = 0;
	u64 end = start + len - 1;

	if (io_decode == ACPI_DECODE_16)
		flags |= IORESOURCE_IO_16BIT_ADDR;
	if (len == 0 || end >= 0x10003)
		flags |= IORESOURCE_DISABLED;
	if (window)
		flags |= IORESOURCE_WINDOW;

	pnp_add_io_resource(dev, start, end, flags);
}

/*
 * Device CSRs that do not appear in PCI config space should be described
 * via ACPI.  This would normally be done with Address Space Descriptors
 * marked as "consumer-only," but old versions of Windows and Linux ignore
 * the producer/consumer flag, so HP invented a vendor-defined resource to
 * describe the location and size of CSR space.
 */
static struct acpi_vendor_uuid hp_ccsr_uuid = {
	.subtype = 2,
	.data = { 0xf9, 0xad, 0xe9, 0x69, 0x4f, 0x92, 0x5f, 0xab, 0xf6, 0x4a,
	    0x24, 0xd2, 0x01, 0x37, 0x0e, 0xad },
};

static int vendor_resource_matches(struct pnp_dev *dev,
				   struct acpi_resource_vendor_typed *vendor,
				   struct acpi_vendor_uuid *match,
				   int expected_len)
{
	int uuid_len = sizeof(vendor->uuid);
	u8 uuid_subtype = vendor->uuid_subtype;
	u8 *uuid = vendor->uuid;
	int actual_len;

	/* byte_length includes uuid_subtype and uuid */
	actual_len = vendor->byte_length - uuid_len - 1;

	if (uuid_subtype == match->subtype &&
	    uuid_len == sizeof(match->data) &&
	    memcmp(uuid, match->data, uuid_len) == 0) {
		if (expected_len && expected_len != actual_len) {
			dev_err(&dev->dev, "wrong vendor descriptor size; "
				"expected %d, found %d bytes\n",
				expected_len, actual_len);
			return 0;
		}

		return 1;
	}

	return 0;
}

static void pnpacpi_parse_allocated_vendor(struct pnp_dev *dev,
				    struct acpi_resource_vendor_typed *vendor)
{
	if (vendor_resource_matches(dev, vendor, &hp_ccsr_uuid, 16)) {
		u64 start, length;

		memcpy(&start, vendor->byte_data, sizeof(start));
		memcpy(&length, vendor->byte_data + 8, sizeof(length));

		pnp_add_mem_resource(dev, start, start + length - 1, 0);
	}
}

static void pnpacpi_parse_allocated_memresource(struct pnp_dev *dev,
						u64 start, u64 len,
						int write_protect, int window)
{
	int flags = 0;
	u64 end = start + len - 1;

	if (len == 0)
		flags |= IORESOURCE_DISABLED;
	if (write_protect == ACPI_READ_WRITE_MEMORY)
		flags |= IORESOURCE_MEM_WRITEABLE;
	if (window)
		flags |= IORESOURCE_WINDOW;

	pnp_add_mem_resource(dev, start, end, flags);
}

static void pnpacpi_parse_allocated_busresource(struct pnp_dev *dev,
						u64 start, u64 len)
{
	u64 end = start + len - 1;

	pnp_add_bus_resource(dev, start, end);
}

static u64 addr_space_length(struct pnp_dev *dev, u64 min, u64 max, u64 len)
{
	u64 max_len;

	max_len = max - min + 1;
	if (len <= max_len)
		return len;

	/*
	 * Per 6.4.3.5, _LEN cannot exceed _MAX - _MIN + 1, but some BIOSes
	 * don't do this correctly, e.g.,
	 * https://bugzilla.kernel.org/show_bug.cgi?id=15480
	 */
	dev_info(&dev->dev,
	         "resource length %#llx doesn't fit in %#llx-%#llx, trimming\n",
		 (unsigned long long) len, (unsigned long long) min,
		 (unsigned long long) max);
	return max_len;
}

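/*
 * ADDRESS16/32/64 descriptors are first converted to the generic
 * acpi_resource_address64 form, then dispatched by resource_type as a
 * memory, I/O, or bus number range.  Producer descriptors are flagged
 * as windows.
 */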
static void pnpacpi_parse_allocated_address_space(struct pnp_dev *dev,
						  struct acpi_resource *res)
{
	struct acpi_resource_address64 addr, *p = &addr;
	acpi_status status;
	int window;
	u64 len;

	status = acpi_resource_to_address64(res, p);
	if (!ACPI_SUCCESS(status)) {
		dev_warn(&dev->dev, "failed to convert resource type %d\n",
			 res->type);
		return;
	}

	len = addr_space_length(dev, p->minimum, p->maximum, p->address_length);
	window = (p->producer_consumer == ACPI_PRODUCER) ? 1 : 0;

	if (p->resource_type == ACPI_MEMORY_RANGE)
		pnpacpi_parse_allocated_memresource(dev, p->minimum, len,
			p->info.mem.write_protect, window);
	else if (p->resource_type == ACPI_IO_RANGE)
		pnpacpi_parse_allocated_ioresource(dev, p->minimum, len,
			p->granularity == 0xfff ? ACPI_DECODE_10 :
				ACPI_DECODE_16, window);
	else if (p->resource_type == ACPI_BUS_NUMBER_RANGE)
		pnpacpi_parse_allocated_busresource(dev, p->minimum, len);
}

static void pnpacpi_parse_allocated_ext_address_space(struct pnp_dev *dev,
						      struct acpi_resource *res)
{
	struct acpi_resource_extended_address64 *p = &res->data.ext_address64;
	int window;
	u64 len;

	len = addr_space_length(dev, p->minimum, p->maximum, p->address_length);
	window = (p->producer_consumer == ACPI_PRODUCER) ? 1 : 0;

	if (p->resource_type == ACPI_MEMORY_RANGE)
		pnpacpi_parse_allocated_memresource(dev, p->minimum, len,
			p->info.mem.write_protect, window);
	else if (p->resource_type == ACPI_IO_RANGE)
		pnpacpi_parse_allocated_ioresource(dev, p->minimum, len,
			p->granularity == 0xfff ? ACPI_DECODE_10 :
				ACPI_DECODE_16, window);
	else if (p->resource_type == ACPI_BUS_NUMBER_RANGE)
		pnpacpi_parse_allocated_busresource(dev, p->minimum, len);
}

static acpi_status pnpacpi_allocated_resource(struct acpi_resource *res,
					      void *data)
{
	struct pnp_dev *dev = data;
	struct acpi_resource_irq *irq;
	struct acpi_resource_dma *dma;
	struct acpi_resource_io *io;
	struct acpi_resource_fixed_io *fixed_io;
	struct acpi_resource_vendor_typed *vendor_typed;
	struct acpi_resource_memory24 *memory24;
	struct acpi_resource_memory32 *memory32;
	struct acpi_resource_fixed_memory32 *fixed_memory32;
	struct acpi_resource_extended_irq *extended_irq;
	int i, flags;

	switch (res->type) {
	case ACPI_RESOURCE_TYPE_IRQ:
		/*
		 * Per spec, only one interrupt per descriptor is allowed in
		 * _CRS, but some firmware violates this, so parse them all.
		 */
		irq = &res->data.irq;
		if (irq->interrupt_count == 0)
			pnp_add_irq_resource(dev, 0, IORESOURCE_DISABLED);
		else {
			for (i = 0; i < irq->interrupt_count; i++) {
				pnpacpi_parse_allocated_irqresource(dev,
					irq->interrupts[i],
					irq->triggering,
					irq->polarity,
					irq->sharable);
			}

			/*
			 * The IRQ encoder puts a single interrupt in each
			 * descriptor, so if a _CRS descriptor has more than
			 * one interrupt, we won't be able to re-encode it.
			 */
			if (pnp_can_write(dev) && irq->interrupt_count > 1) {
				dev_warn(&dev->dev, "multiple interrupts in "
					 "_CRS descriptor; configuration can't "
					 "be changed\n");
				dev->capabilities &= ~PNP_WRITE;
			}
		}
		break;

	case ACPI_RESOURCE_TYPE_DMA:
		dma = &res->data.dma;
		if (dma->channel_count > 0 && dma->channels[0] != (u8) -1)
			flags = dma_flags(dev, dma->type, dma->bus_master,
					  dma->transfer);
		else
			flags = IORESOURCE_DISABLED;
		pnp_add_dma_resource(dev, dma->channels[0], flags);
		break;

	case ACPI_RESOURCE_TYPE_IO:
		io = &res->data.io;
		pnpacpi_parse_allocated_ioresource(dev,
			io->minimum,
			io->address_length,
			io->io_decode, 0);
		break;

	case ACPI_RESOURCE_TYPE_START_DEPENDENT:
	case ACPI_RESOURCE_TYPE_END_DEPENDENT:
		break;

	case ACPI_RESOURCE_TYPE_FIXED_IO:
		fixed_io = &res->data.fixed_io;
		pnpacpi_parse_allocated_ioresource(dev,
			fixed_io->address,
			fixed_io->address_length,
			ACPI_DECODE_10, 0);
		break;

	case ACPI_RESOURCE_TYPE_VENDOR:
		vendor_typed = &res->data.vendor_typed;
		pnpacpi_parse_allocated_vendor(dev, vendor_typed);
		break;

	case ACPI_RESOURCE_TYPE_END_TAG:
		break;

	case ACPI_RESOURCE_TYPE_MEMORY24:
		memory24 = &res->data.memory24;
		pnpacpi_parse_allocated_memresource(dev,
			memory24->minimum,
			memory24->address_length,
			memory24->write_protect, 0);
		break;
	case ACPI_RESOURCE_TYPE_MEMORY32:
		memory32 = &res->data.memory32;
		pnpacpi_parse_allocated_memresource(dev,
			memory32->minimum,
			memory32->address_length,
			memory32->write_protect, 0);
		break;
	case ACPI_RESOURCE_TYPE_FIXED_MEMORY32:
		fixed_memory32 = &res->data.fixed_memory32;
		pnpacpi_parse_allocated_memresource(dev,
			fixed_memory32->address,
			fixed_memory32->address_length,
			fixed_memory32->write_protect, 0);
		break;
	case ACPI_RESOURCE_TYPE_ADDRESS16:
	case ACPI_RESOURCE_TYPE_ADDRESS32:
	case ACPI_RESOURCE_TYPE_ADDRESS64:
		pnpacpi_parse_allocated_address_space(dev, res);
		break;

	case ACPI_RESOURCE_TYPE_EXTENDED_ADDRESS64:
		pnpacpi_parse_allocated_ext_address_space(dev, res);
		break;

	case ACPI_RESOURCE_TYPE_EXTENDED_IRQ:
		extended_irq = &res->data.extended_irq;

		if (extended_irq->interrupt_count == 0)
			pnp_add_irq_resource(dev, 0, IORESOURCE_DISABLED);
		else {
			for (i = 0; i < extended_irq->interrupt_count; i++) {
				pnpacpi_parse_allocated_irqresource(dev,
					extended_irq->interrupts[i],
					extended_irq->triggering,
					extended_irq->polarity,
					extended_irq->sharable);
			}

			/*
			 * The IRQ encoder puts a single interrupt in each
			 * descriptor, so if a _CRS descriptor has more than
			 * one interrupt, we won't be able to re-encode it.
			 */
			if (pnp_can_write(dev) &&
			    extended_irq->interrupt_count > 1) {
				dev_warn(&dev->dev, "multiple interrupts in "
					 "_CRS descriptor; configuration can't "
					 "be changed\n");
				dev->capabilities &= ~PNP_WRITE;
			}
		}
		break;

	case ACPI_RESOURCE_TYPE_GENERIC_REGISTER:
		break;

	default:
		dev_warn(&dev->dev, "unknown resource type %d in _CRS\n",
			 res->type);
		return AE_ERROR;
	}

	return AE_OK;
}

int pnpacpi_parse_allocated_resource(struct pnp_dev *dev)
{
	struct acpi_device *acpi_dev = dev->data;
	acpi_handle handle = acpi_dev->handle;
	acpi_status status;

	pnp_dbg(&dev->dev, "parse allocated resources\n");

	pnp_init_resources(dev);

	status = acpi_walk_resources(handle, METHOD_NAME__CRS,
				     pnpacpi_allocated_resource, dev);

	if (ACPI_FAILURE(status)) {
		if (status != AE_NOT_FOUND)
			dev_err(&dev->dev, "can't evaluate _CRS: %d", status);
		return -EPERM;
	}
	return 0;
}

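/*
 * Resource Configuration Options (_PRS)
 */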
static __init void pnpacpi_parse_dma_option(struct pnp_dev *dev,
					    unsigned int option_flags,
					    struct acpi_resource_dma *p)
{
	int i;
	unsigned char map = 0, flags;

	if (p->channel_count == 0)
		return;

	for (i = 0; i < p->channel_count; i++)
		map |= 1 << p->channels[i];

	flags = dma_flags(dev, p->type, p->bus_master, p->transfer);
	pnp_register_dma_resource(dev, option_flags, map, flags);
}

static __init void pnpacpi_parse_irq_option(struct pnp_dev *dev,
					    unsigned int option_flags,
					    struct acpi_resource_irq *p)
{
	int i;
	pnp_irq_mask_t map;
	unsigned char flags;

	if (p->interrupt_count == 0)
		return;

	bitmap_zero(map.bits, PNP_IRQ_NR);
	for (i = 0; i < p->interrupt_count; i++)
		if (p->interrupts[i])
			__set_bit(p->interrupts[i], map.bits);

	flags = irq_flags(p->triggering, p->polarity, p->sharable);
	pnp_register_irq_resource(dev, option_flags, &map, flags);
}

static __init void pnpacpi_parse_ext_irq_option(struct pnp_dev *dev,
					unsigned int option_flags,
					struct acpi_resource_extended_irq *p)
{
	int i;
	pnp_irq_mask_t map;
	unsigned char flags;

	if (p->interrupt_count == 0)
		return;

	bitmap_zero(map.bits, PNP_IRQ_NR);
	for (i = 0; i < p->interrupt_count; i++) {
		if (p->interrupts[i]) {
			if (p->interrupts[i] < PNP_IRQ_NR)
				__set_bit(p->interrupts[i], map.bits);
			else
				dev_err(&dev->dev, "ignoring IRQ %d option "
					"(too large for %d entry bitmap)\n",
					p->interrupts[i], PNP_IRQ_NR);
		}
	}

	flags = irq_flags(p->triggering, p->polarity, p->sharable);
	pnp_register_irq_resource(dev, option_flags, &map, flags);
}

static __init void pnpacpi_parse_port_option(struct pnp_dev *dev,
					     unsigned int option_flags,
					     struct acpi_resource_io *io)
{
	unsigned char flags = 0;

	if (io->address_length == 0)
		return;

	if (io->io_decode == ACPI_DECODE_16)
		flags = IORESOURCE_IO_16BIT_ADDR;
	pnp_register_port_resource(dev, option_flags, io->minimum, io->maximum,
				   io->alignment, io->address_length, flags);
}

static __init void pnpacpi_parse_fixed_port_option(struct pnp_dev *dev,
					unsigned int option_flags,
					struct acpi_resource_fixed_io *io)
{
	if (io->address_length == 0)
		return;

	pnp_register_port_resource(dev, option_flags, io->address, io->address,
				   0, io->address_length, IORESOURCE_IO_FIXED);
}

static __init void pnpacpi_parse_mem24_option(struct pnp_dev *dev,
					      unsigned int option_flags,
					      struct acpi_resource_memory24 *p)
{
	unsigned char flags = 0;

	if (p->address_length == 0)
		return;

	if (p->write_protect == ACPI_READ_WRITE_MEMORY)
		flags = IORESOURCE_MEM_WRITEABLE;
	pnp_register_mem_resource(dev, option_flags, p->minimum, p->maximum,
				  p->alignment, p->address_length, flags);
}

static __init void pnpacpi_parse_mem32_option(struct pnp_dev *dev,
					      unsigned int option_flags,
					      struct acpi_resource_memory32 *p)
{
	unsigned char flags = 0;

	if (p->address_length == 0)
		return;

	if (p->write_protect == ACPI_READ_WRITE_MEMORY)
		flags = IORESOURCE_MEM_WRITEABLE;
	pnp_register_mem_resource(dev, option_flags, p->minimum, p->maximum,
				  p->alignment, p->address_length, flags);
}

static __init void pnpacpi_parse_fixed_mem32_option(struct pnp_dev *dev,
					unsigned int option_flags,
					struct acpi_resource_fixed_memory32 *p)
{
	unsigned char flags = 0;

	if (p->address_length == 0)
		return;

	if (p->write_protect == ACPI_READ_WRITE_MEMORY)
		flags = IORESOURCE_MEM_WRITEABLE;
	pnp_register_mem_resource(dev, option_flags, p->address, p->address,
				  0, p->address_length, flags);
}

static __init void pnpacpi_parse_address_option(struct pnp_dev *dev,
						unsigned int option_flags,
						struct acpi_resource *r)
{
	struct acpi_resource_address64 addr, *p = &addr;
	acpi_status status;
	unsigned char flags = 0;

	status = acpi_resource_to_address64(r, p);
	if (ACPI_FAILURE(status)) {
		dev_warn(&dev->dev, "can't convert resource type %d\n",
			 r->type);
		return;
	}

	if (p->address_length == 0)
		return;

	if (p->resource_type == ACPI_MEMORY_RANGE) {
		if (p->info.mem.write_protect == ACPI_READ_WRITE_MEMORY)
			flags = IORESOURCE_MEM_WRITEABLE;
		pnp_register_mem_resource(dev, option_flags, p->minimum,
					  p->minimum, 0, p->address_length,
					  flags);
	} else if (p->resource_type == ACPI_IO_RANGE)
		pnp_register_port_resource(dev, option_flags, p->minimum,
					   p->minimum, 0, p->address_length,
					   IORESOURCE_IO_FIXED);
}

static __init void pnpacpi_parse_ext_address_option(struct pnp_dev *dev,
						    unsigned int option_flags,
						    struct acpi_resource *r)
{
	struct acpi_resource_extended_address64 *p = &r->data.ext_address64;
	unsigned char flags = 0;

	if (p->address_length == 0)
		return;

	if (p->resource_type == ACPI_MEMORY_RANGE) {
		if (p->info.mem.write_protect == ACPI_READ_WRITE_MEMORY)
			flags = IORESOURCE_MEM_WRITEABLE;
		pnp_register_mem_resource(dev, option_flags, p->minimum,
					  p->minimum, 0, p->address_length,
					  flags);
	} else if (p->resource_type == ACPI_IO_RANGE)
		pnp_register_port_resource(dev, option_flags, p->minimum,
					   p->minimum, 0, p->address_length,
					   IORESOURCE_IO_FIXED);
}

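/*
 * Walk context for pnpacpi_option_resource(): option_flags identifies the
 * dependent set that subsequent descriptors belong to, and is updated when
 * StartDependentFn/EndDependentFn descriptors are encountered.
 */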
struct acpipnp_parse_option_s {
	struct pnp_dev *dev;
	unsigned int option_flags;
};

static __init acpi_status pnpacpi_option_resource(struct acpi_resource *res,
						  void *data)
{
	int priority;
	struct acpipnp_parse_option_s *parse_data = data;
	struct pnp_dev *dev = parse_data->dev;
	unsigned int option_flags = parse_data->option_flags;

	switch (res->type) {
	case ACPI_RESOURCE_TYPE_IRQ:
		pnpacpi_parse_irq_option(dev, option_flags, &res->data.irq);
		break;

	case ACPI_RESOURCE_TYPE_DMA:
		pnpacpi_parse_dma_option(dev, option_flags, &res->data.dma);
		break;

	case ACPI_RESOURCE_TYPE_START_DEPENDENT:
		switch (res->data.start_dpf.compatibility_priority) {
		case ACPI_GOOD_CONFIGURATION:
			priority = PNP_RES_PRIORITY_PREFERRED;
			break;

		case ACPI_ACCEPTABLE_CONFIGURATION:
			priority = PNP_RES_PRIORITY_ACCEPTABLE;
			break;

		case ACPI_SUB_OPTIMAL_CONFIGURATION:
			priority = PNP_RES_PRIORITY_FUNCTIONAL;
			break;
		default:
			priority = PNP_RES_PRIORITY_INVALID;
			break;
		}
		parse_data->option_flags = pnp_new_dependent_set(dev, priority);
		break;

	case ACPI_RESOURCE_TYPE_END_DEPENDENT:
		parse_data->option_flags = 0;
		break;

	case ACPI_RESOURCE_TYPE_IO:
		pnpacpi_parse_port_option(dev, option_flags, &res->data.io);
		break;

	case ACPI_RESOURCE_TYPE_FIXED_IO:
		pnpacpi_parse_fixed_port_option(dev, option_flags,
					        &res->data.fixed_io);
		break;

	case ACPI_RESOURCE_TYPE_VENDOR:
	case ACPI_RESOURCE_TYPE_END_TAG:
		break;

	case ACPI_RESOURCE_TYPE_MEMORY24:
		pnpacpi_parse_mem24_option(dev, option_flags,
					   &res->data.memory24);
		break;

	case ACPI_RESOURCE_TYPE_MEMORY32:
		pnpacpi_parse_mem32_option(dev, option_flags,
					   &res->data.memory32);
		break;

	case ACPI_RESOURCE_TYPE_FIXED_MEMORY32:
		pnpacpi_parse_fixed_mem32_option(dev, option_flags,
						 &res->data.fixed_memory32);
		break;

	case ACPI_RESOURCE_TYPE_ADDRESS16:
	case ACPI_RESOURCE_TYPE_ADDRESS32:
	case ACPI_RESOURCE_TYPE_ADDRESS64:
		pnpacpi_parse_address_option(dev, option_flags, res);
		break;

	case ACPI_RESOURCE_TYPE_EXTENDED_ADDRESS64:
		pnpacpi_parse_ext_address_option(dev, option_flags, res);
		break;

	case ACPI_RESOURCE_TYPE_EXTENDED_IRQ:
		pnpacpi_parse_ext_irq_option(dev, option_flags,
					     &res->data.extended_irq);
		break;

	case ACPI_RESOURCE_TYPE_GENERIC_REGISTER:
		break;

	default:
		dev_warn(&dev->dev, "unknown resource type %d in _PRS\n",
			 res->type);
		return AE_ERROR;
	}

	return AE_OK;
}

int __init pnpacpi_parse_resource_option_data(struct pnp_dev *dev)
{
	struct acpi_device *acpi_dev = dev->data;
	acpi_handle handle = acpi_dev->handle;
	acpi_status status;
	struct acpipnp_parse_option_s parse_data;

	pnp_dbg(&dev->dev, "parse resource options\n");

	parse_data.dev = dev;
	parse_data.option_flags = 0;

	status = acpi_walk_resources(handle, METHOD_NAME__PRS,
				     pnpacpi_option_resource, &parse_data);

	if (ACPI_FAILURE(status)) {
		if (status != AE_NOT_FOUND)
			dev_err(&dev->dev, "can't evaluate _PRS: %d", status);
		return -EPERM;
	}
	return 0;
}

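/*
 * Descriptor types that are carried over from _CRS when sizing and typing
 * the _SRS template below.
 */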
static int pnpacpi_supported_resource(struct acpi_resource *res)
{
	switch (res->type) {
	case ACPI_RESOURCE_TYPE_IRQ:
	case ACPI_RESOURCE_TYPE_DMA:
	case ACPI_RESOURCE_TYPE_IO:
	case ACPI_RESOURCE_TYPE_FIXED_IO:
	case ACPI_RESOURCE_TYPE_MEMORY24:
	case ACPI_RESOURCE_TYPE_MEMORY32:
	case ACPI_RESOURCE_TYPE_FIXED_MEMORY32:
	case ACPI_RESOURCE_TYPE_ADDRESS16:
	case ACPI_RESOURCE_TYPE_ADDRESS32:
	case ACPI_RESOURCE_TYPE_ADDRESS64:
	case ACPI_RESOURCE_TYPE_EXTENDED_ADDRESS64:
	case ACPI_RESOURCE_TYPE_EXTENDED_IRQ:
		return 1;
	}
	return 0;
}

/*
 * Set resource
 */
static acpi_status pnpacpi_count_resources(struct acpi_resource *res,
					   void *data)
{
	int *res_cnt = data;

	if (pnpacpi_supported_resource(res))
		(*res_cnt)++;
	return AE_OK;
}

static acpi_status pnpacpi_type_resources(struct acpi_resource *res, void *data)
{
	struct acpi_resource **resource = data;

	if (pnpacpi_supported_resource(res)) {
		(*resource)->type = res->type;
		(*resource)->length = sizeof(struct acpi_resource);
		if (res->type == ACPI_RESOURCE_TYPE_IRQ)
			(*resource)->data.irq.descriptor_length =
					res->data.irq.descriptor_length;
		(*resource)++;
	}

	return AE_OK;
}

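/*
 * Build an empty buffer for _SRS: count the supported descriptors in _CRS,
 * allocate a zeroed template with one acpi_resource slot per descriptor
 * plus a spare for the end tag, and copy each descriptor's type over so
 * the encode helpers below only have to fill in the data.
 */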
int pnpacpi_build_resource_template(struct pnp_dev *dev,
				    struct acpi_buffer *buffer)
{
	struct acpi_device *acpi_dev = dev->data;
	acpi_handle handle = acpi_dev->handle;
	struct acpi_resource *resource;
	int res_cnt = 0;
	acpi_status status;

	status = acpi_walk_resources(handle, METHOD_NAME__CRS,
				     pnpacpi_count_resources, &res_cnt);
	if (ACPI_FAILURE(status)) {
		dev_err(&dev->dev, "can't evaluate _CRS: %d\n", status);
		return -EINVAL;
	}
	if (!res_cnt)
		return -EINVAL;
	buffer->length = sizeof(struct acpi_resource) * (res_cnt + 1) + 1;
	buffer->pointer = kzalloc(buffer->length - 1, GFP_KERNEL);
	if (!buffer->pointer)
		return -ENOMEM;

	resource = (struct acpi_resource *)buffer->pointer;
	status = acpi_walk_resources(handle, METHOD_NAME__CRS,
				     pnpacpi_type_resources, &resource);
	if (ACPI_FAILURE(status)) {
		kfree(buffer->pointer);
		dev_err(&dev->dev, "can't evaluate _CRS: %d\n", status);
		return -EINVAL;
	}
	/* resource now points to the spare trailing slot; terminate the list */
	resource->type = ACPI_RESOURCE_TYPE_END_TAG;

	return 0;
}

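/*
 * Encoders: convert the device's current PnP resources back into the ACPI
 * descriptors of the _SRS template.
 */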
static void pnpacpi_encode_irq(struct pnp_dev *dev,
			       struct acpi_resource *resource,
			       struct resource *p)
{
	struct acpi_resource_irq *irq = &resource->data.irq;
	int triggering, polarity, shareable;

	if (!pnp_resource_enabled(p)) {
		irq->interrupt_count = 0;
		pnp_dbg(&dev->dev, "  encode irq (%s)\n",
			p ? "disabled" : "missing");
		return;
	}

	decode_irq_flags(dev, p->flags, &triggering, &polarity, &shareable);
	irq->triggering = triggering;
	irq->polarity = polarity;
	irq->sharable = shareable;
	irq->interrupt_count = 1;
	irq->interrupts[0] = p->start;

	pnp_dbg(&dev->dev, "  encode irq %d %s %s %s (%d-byte descriptor)\n",
		(int) p->start,
		triggering == ACPI_LEVEL_SENSITIVE ? "level" : "edge",
		polarity == ACPI_ACTIVE_LOW ? "low" : "high",
		irq->sharable == ACPI_SHARED ? "shared" : "exclusive",
		irq->descriptor_length);
}

static void pnpacpi_encode_ext_irq(struct pnp_dev *dev,
				   struct acpi_resource *resource,
				   struct resource *p)
{
	struct acpi_resource_extended_irq *extended_irq = &resource->data.extended_irq;
	int triggering, polarity, shareable;

	if (!pnp_resource_enabled(p)) {
		extended_irq->interrupt_count = 0;
		pnp_dbg(&dev->dev, "  encode extended irq (%s)\n",
			p ? "disabled" : "missing");
		return;
	}

	decode_irq_flags(dev, p->flags, &triggering, &polarity, &shareable);
	extended_irq->producer_consumer = ACPI_CONSUMER;
	extended_irq->triggering = triggering;
	extended_irq->polarity = polarity;
	extended_irq->sharable = shareable;
	extended_irq->interrupt_count = 1;
	extended_irq->interrupts[0] = p->start;

	pnp_dbg(&dev->dev, "  encode irq %d %s %s %s\n", (int) p->start,
		triggering == ACPI_LEVEL_SENSITIVE ? "level" : "edge",
		polarity == ACPI_ACTIVE_LOW ? "low" : "high",
		extended_irq->sharable == ACPI_SHARED ? "shared" : "exclusive");
}

static void pnpacpi_encode_dma(struct pnp_dev *dev,
			       struct acpi_resource *resource,
			       struct resource *p)
{
	struct acpi_resource_dma *dma = &resource->data.dma;

	if (!pnp_resource_enabled(p)) {
		dma->channel_count = 0;
		pnp_dbg(&dev->dev, "  encode dma (%s)\n",
			p ? "disabled" : "missing");
		return;
	}

	/* Note: pnp_assign_dma will copy pnp_dma->flags into p->flags */
	switch (p->flags & IORESOURCE_DMA_SPEED_MASK) {
	case IORESOURCE_DMA_TYPEA:
		dma->type = ACPI_TYPE_A;
		break;
	case IORESOURCE_DMA_TYPEB:
		dma->type = ACPI_TYPE_B;
		break;
	case IORESOURCE_DMA_TYPEF:
		dma->type = ACPI_TYPE_F;
		break;
	default:
		dma->type = ACPI_COMPATIBILITY;
	}

	switch (p->flags & IORESOURCE_DMA_TYPE_MASK) {
	case IORESOURCE_DMA_8BIT:
		dma->transfer = ACPI_TRANSFER_8;
		break;
	case IORESOURCE_DMA_8AND16BIT:
		dma->transfer = ACPI_TRANSFER_8_16;
		break;
	default:
		dma->transfer = ACPI_TRANSFER_16;
	}

	dma->bus_master = !!(p->flags & IORESOURCE_DMA_MASTER);
	dma->channel_count = 1;
	dma->channels[0] = p->start;

	pnp_dbg(&dev->dev, "  encode dma %d "
		"type %#x transfer %#x master %d\n",
		(int) p->start, dma->type, dma->transfer, dma->bus_master);
}

static void pnpacpi_encode_io(struct pnp_dev *dev,
			      struct acpi_resource *resource,
			      struct resource *p)
{
	struct acpi_resource_io *io = &resource->data.io;

	if (pnp_resource_enabled(p)) {
		/* Note: pnp_assign_port copies pnp_port->flags into p->flags */
		io->io_decode = (p->flags & IORESOURCE_IO_16BIT_ADDR) ?
		    ACPI_DECODE_16 : ACPI_DECODE_10;
		io->minimum = p->start;
		io->maximum = p->end;
		io->alignment = 0;	/* Correct? */
		io->address_length = p->end - p->start + 1;
	} else {
		io->minimum = 0;
		io->address_length = 0;
	}

	pnp_dbg(&dev->dev, "  encode io %#x-%#x decode %#x\n", io->minimum,
		io->minimum + io->address_length - 1, io->io_decode);
}

static void pnpacpi_encode_fixed_io(struct pnp_dev *dev,
				    struct acpi_resource *resource,
				    struct resource *p)
{
	struct acpi_resource_fixed_io *fixed_io = &resource->data.fixed_io;

	if (pnp_resource_enabled(p)) {
		fixed_io->address = p->start;
		fixed_io->address_length = p->end - p->start + 1;
	} else {
		fixed_io->address = 0;
		fixed_io->address_length = 0;
	}

	pnp_dbg(&dev->dev, "  encode fixed_io %#x-%#x\n", fixed_io->address,
		fixed_io->address + fixed_io->address_length - 1);
}

static void pnpacpi_encode_mem24(struct pnp_dev *dev,
				 struct acpi_resource *resource,
				 struct resource *p)
{
	struct acpi_resource_memory24 *memory24 = &resource->data.memory24;

	if (pnp_resource_enabled(p)) {
		/* Note: pnp_assign_mem copies pnp_mem->flags into p->flags */
		memory24->write_protect = p->flags & IORESOURCE_MEM_WRITEABLE ?
		    ACPI_READ_WRITE_MEMORY : ACPI_READ_ONLY_MEMORY;
		memory24->minimum = p->start;
		memory24->maximum = p->end;
		memory24->alignment = 0;
		memory24->address_length = p->end - p->start + 1;
	} else {
		memory24->minimum = 0;
		memory24->address_length = 0;
	}

	pnp_dbg(&dev->dev, "  encode mem24 %#x-%#x write_protect %#x\n",
		memory24->minimum,
		memory24->minimum + memory24->address_length - 1,
		memory24->write_protect);
}

static void pnpacpi_encode_mem32(struct pnp_dev *dev,
				 struct acpi_resource *resource,
				 struct resource *p)
{
	struct acpi_resource_memory32 *memory32 = &resource->data.memory32;

	if (pnp_resource_enabled(p)) {
		memory32->write_protect = p->flags & IORESOURCE_MEM_WRITEABLE ?
		    ACPI_READ_WRITE_MEMORY : ACPI_READ_ONLY_MEMORY;
		memory32->minimum = p->start;
		memory32->maximum = p->end;
		memory32->alignment = 0;
		memory32->address_length = p->end - p->start + 1;
	} else {
		memory32->minimum = 0;
		memory32->address_length = 0;
	}

	pnp_dbg(&dev->dev, "  encode mem32 %#x-%#x write_protect %#x\n",
		memory32->minimum,
		memory32->minimum + memory32->address_length - 1,
		memory32->write_protect);
}

static void pnpacpi_encode_fixed_mem32(struct pnp_dev *dev,
				       struct acpi_resource *resource,
				       struct resource *p)
{
	struct acpi_resource_fixed_memory32 *fixed_memory32 = &resource->data.fixed_memory32;

	if (pnp_resource_enabled(p)) {
		fixed_memory32->write_protect =
		    p->flags & IORESOURCE_MEM_WRITEABLE ?
		    ACPI_READ_WRITE_MEMORY : ACPI_READ_ONLY_MEMORY;
		fixed_memory32->address = p->start;
		fixed_memory32->address_length = p->end - p->start + 1;
	} else {
		fixed_memory32->address = 0;
		fixed_memory32->address_length = 0;
	}

	pnp_dbg(&dev->dev, "  encode fixed_mem32 %#x-%#x write_protect %#x\n",
		fixed_memory32->address,
		fixed_memory32->address + fixed_memory32->address_length - 1,
		fixed_memory32->write_protect);
}

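/*
 * Descriptors appear in the template in _CRS order, so walk them in
 * sequence and pair each one with the device's next resource of the
 * matching type (IRQ, DMA, I/O port, or memory).
 */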
int pnpacpi_encode_resources(struct pnp_dev *dev, struct acpi_buffer *buffer)
{
	int i = 0;
	/* pnpacpi_build_resource_template allocates extra mem */
	int res_cnt = (buffer->length - 1) / sizeof(struct acpi_resource) - 1;
	struct acpi_resource *resource = buffer->pointer;
	int port = 0, irq = 0, dma = 0, mem = 0;

	pnp_dbg(&dev->dev, "encode %d resources\n", res_cnt);
	while (i < res_cnt) {
		switch (resource->type) {
		case ACPI_RESOURCE_TYPE_IRQ:
			pnpacpi_encode_irq(dev, resource,
			       pnp_get_resource(dev, IORESOURCE_IRQ, irq));
			irq++;
			break;

		case ACPI_RESOURCE_TYPE_DMA:
			pnpacpi_encode_dma(dev, resource,
				pnp_get_resource(dev, IORESOURCE_DMA, dma));
			dma++;
			break;
		case ACPI_RESOURCE_TYPE_IO:
			pnpacpi_encode_io(dev, resource,
				pnp_get_resource(dev, IORESOURCE_IO, port));
			port++;
			break;
		case ACPI_RESOURCE_TYPE_FIXED_IO:
			pnpacpi_encode_fixed_io(dev, resource,
				pnp_get_resource(dev, IORESOURCE_IO, port));
			port++;
			break;
		case ACPI_RESOURCE_TYPE_MEMORY24:
			pnpacpi_encode_mem24(dev, resource,
				pnp_get_resource(dev, IORESOURCE_MEM, mem));
			mem++;
			break;
		case ACPI_RESOURCE_TYPE_MEMORY32:
			pnpacpi_encode_mem32(dev, resource,
				pnp_get_resource(dev, IORESOURCE_MEM, mem));
			mem++;
			break;
		case ACPI_RESOURCE_TYPE_FIXED_MEMORY32:
			pnpacpi_encode_fixed_mem32(dev, resource,
				pnp_get_resource(dev, IORESOURCE_MEM, mem));
			mem++;
			break;
		case ACPI_RESOURCE_TYPE_EXTENDED_IRQ:
			pnpacpi_encode_ext_irq(dev, resource,
				pnp_get_resource(dev, IORESOURCE_IRQ, irq));
			irq++;
			break;
		case ACPI_RESOURCE_TYPE_START_DEPENDENT:
		case ACPI_RESOURCE_TYPE_END_DEPENDENT:
		case ACPI_RESOURCE_TYPE_VENDOR:
		case ACPI_RESOURCE_TYPE_END_TAG:
		case ACPI_RESOURCE_TYPE_ADDRESS16:
		case ACPI_RESOURCE_TYPE_ADDRESS32:
		case ACPI_RESOURCE_TYPE_ADDRESS64:
		case ACPI_RESOURCE_TYPE_EXTENDED_ADDRESS64:
		case ACPI_RESOURCE_TYPE_GENERIC_REGISTER:
		default:	/* other type */
			dev_warn(&dev->dev, "can't encode unknown resource "
				 "type %d\n", resource->type);
			return -EINVAL;
		}
		resource++;
		i++;
	}
	return 0;
}
l kwb">static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) { return calc_delta_fair(sched_slice(cfs_rq, se), se); } /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. */ static inline void __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { unsigned long delta_exec_weighted; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); delta_exec_weighted = calc_delta_fair(delta_exec, curr); curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); } static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; u64 now = rq_of(cfs_rq)->clock; unsigned long delta_exec; if (unlikely(!curr)) return; /* * Get the amount of time the current task was running * since the last time we changed load (this cannot * overflow on 32 bits): */ delta_exec = (unsigned long)(now - curr->exec_start); if (!delta_exec) return; __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); cpuacct_charge(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } } static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); } /* * Task is being enqueued - update stats: */ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * Are we enqueueing a waiting task? (for current tasks * a dequeue/enqueue event is a NOP) */ if (se != cfs_rq->curr) update_stats_wait_start(cfs_rq, se); } static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { schedstat_set(se->wait_max, max(se->wait_max, rq_of(cfs_rq)->clock - se->wait_start)); schedstat_set(se->wait_count, se->wait_count + 1); schedstat_set(se->wait_sum, se->wait_sum + rq_of(cfs_rq)->clock - se->wait_start); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { trace_sched_stat_wait(task_of(se), rq_of(cfs_rq)->clock - se->wait_start); } #endif schedstat_set(se->wait_start, 0); } static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * Mark the end of the wait period if dequeueing a * waiting task: */ if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); } /* * We are picking a new current task - update its stats: */ static inline void update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * We are starting a new run period: */ se->exec_start = rq_of(cfs_rq)->clock; } /************************************************** * Scheduling class queueing methods: */ #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED static void add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) { cfs_rq->task_weight += weight; } #else static inline void add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) { } #endif static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) inc_cpu_load(rq_of(cfs_rq), se->load.weight); if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, se->load.weight); list_add(&se->group_node, &cfs_rq->tasks); } cfs_rq->nr_running++; se->on_rq = 1; } static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { 
update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) dec_cpu_load(rq_of(cfs_rq), se->load.weight); if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, -se->load.weight); list_del_init(&se->group_node); } cfs_rq->nr_running--; se->on_rq = 0; } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS struct task_struct *tsk = NULL; if (entity_is_task(se)) tsk = task_of(se); if (se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; if ((s64)delta < 0) delta = 0; if (unlikely(delta > se->sleep_max)) se->sleep_max = delta; se->sleep_start = 0; se->sum_sleep_runtime += delta; if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); trace_sched_stat_sleep(tsk, delta); } } if (se->block_start) { u64 delta = rq_of(cfs_rq)->clock - se->block_start; if ((s64)delta < 0) delta = 0; if (unlikely(delta > se->block_max)) se->block_max = delta; se->block_start = 0; se->sum_sleep_runtime += delta; if (tsk) { if (tsk->in_iowait) { se->iowait_sum += delta; se->iowait_count++; trace_sched_stat_iowait(tsk, delta); } /* * Blocking time is in units of nanosecs, so shift by * 20 to get a milliseconds-range estimation of the * amount of time that the task spent sleeping: */ if (unlikely(prof_on == SLEEP_PROFILING)) { profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), delta >> 20); } account_scheduler_latency(tsk, delta >> 10, 0); } } #endif } static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHED_DEBUG s64 d = se->vruntime - cfs_rq->min_vruntime; if (d < 0) d = -d; if (d > 3*sysctl_sched_latency) schedstat_inc(cfs_rq, nr_spread_over); #endif } static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { u64 vruntime = cfs_rq->min_vruntime; /* * The 'current' period is already promised to the current tasks, * however the extra weight of the new task will slow them down a * little, place the new task so that it fits in the slot that * stays open at the end. */ if (initial && sched_feat(START_DEBIT)) vruntime += sched_vslice(cfs_rq, se); /* sleeps up to a single latency don't count. */ if (!initial && sched_feat(FAIR_SLEEPERS)) { unsigned long thresh = sysctl_sched_latency; /* * Convert the sleeper threshold into virtual time. * SCHED_IDLE is a special sub-class. We care about * fairness only relative to other SCHED_IDLE tasks, * all of which have the same weight. */ if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) || task_of(se)->policy != SCHED_IDLE)) thresh = calc_delta_fair(thresh, se); /* * Halve their sleep time's effect, to allow * for a gentler effect of sleepers: */ if (sched_feat(GENTLE_FAIR_SLEEPERS)) thresh >>= 1; vruntime -= thresh; } /* ensure we never gain time by being placed backwards. */ vruntime = max_vruntime(se->vruntime, vruntime); se->vruntime = vruntime; } #define ENQUEUE_WAKEUP 1 #define ENQUEUE_MIGRATE 2 static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Update the normalized vruntime before updating min_vruntime * through callig update_curr(). */ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) se->vruntime += cfs_rq->min_vruntime; /* * Update run-time statistics of the 'current'. 
*/ update_curr(cfs_rq); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } update_stats_enqueue(cfs_rq, se); check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); } static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (!se || cfs_rq->last == se) cfs_rq->last = NULL; if (!se || cfs_rq->next == se) cfs_rq->next = NULL; } static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { for_each_sched_entity(se) __clear_buddies(cfs_rq_of(se), se); } static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); update_stats_dequeue(cfs_rq, se); if (sleep) { #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) se->sleep_start = rq_of(cfs_rq)->clock; if (tsk->state & TASK_UNINTERRUPTIBLE) se->block_start = rq_of(cfs_rq)->clock; } #endif } clear_buddies(cfs_rq, se); if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); /* * Normalize the entity after updating the min_vruntime because the * update can refer to the ->curr item and we need to reflect this * movement in our normalized position. */ if (!sleep) se->vruntime -= cfs_rq->min_vruntime; } /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { unsigned long ideal_runtime, delta_exec; ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { resched_task(rq_of(cfs_rq)->curr); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. */ clear_buddies(cfs_rq, curr); return; } /* * Ensure that a task that missed wakeup preemption by a * narrow margin doesn't have to wait for a full slice. * This also mitigates buddy induced latencies under load. */ if (!sched_feat(WAKEUP_PREEMPT)) return; if (delta_exec < sysctl_sched_min_granularity) return; if (cfs_rq->nr_running > 1) { struct sched_entity *se = __pick_next_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; if (delta > ideal_runtime) resched_task(rq_of(cfs_rq)->curr); } } static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* 'current' is not kept within the tree. */ if (se->on_rq) { /* * Any task has to be enqueued before it get to execute on * a CPU. So account for the time it spent waiting on the * runqueue. */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); } update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; #ifdef CONFIG_SCHEDSTATS /* * Track our maximum slice length, if the CPU's load is at * least twice that of our own weight (i.e. 
dont track it * when there are only lesser-weight tasks around): */ if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { se->slice_max = max(se->slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } #endif se->prev_sum_exec_runtime = se->sum_exec_runtime; } static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); struct sched_entity *left = se; if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) se = cfs_rq->next; /* * Prefer last buddy, try to return the CPU to a preempted task. */ if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) se = cfs_rq->last; clear_buddies(cfs_rq, se); return se; } static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { /* * If still on the runqueue then deactivate_task() * was not called and update_curr() has to be done: */ if (prev->on_rq) update_curr(cfs_rq); check_spread(cfs_rq, prev); if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); } cfs_rq->curr = NULL; } static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) { /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); #ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother * validating it and just reschedule. */ if (queued) { resched_task(rq_of(cfs_rq)->curr); return; } /* * don't let the period tick interfere with the hrtick preemption */ if (!sched_feat(DOUBLE_TICK) && hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) return; #endif if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) check_preempt_tick(cfs_rq, curr); } /************************************************** * CFS operations on tasks: */ #ifdef CONFIG_SCHED_HRTICK static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); WARN_ON(task_rq(p) != rq); if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; s64 delta = slice - ran; if (delta < 0) { if (rq->curr == p) resched_task(p); return; } /* * Don't schedule slices shorter than 10000ns, that just * doesn't make sense. Rely on vruntime for fairness. */ if (rq->curr != p) delta = max_t(s64, 10000LL, delta); hrtick_start(rq, delta); } } /* * called from enqueue/dequeue and updates the hrtick when the * current task is from our class and nr_running is low enough * to matter. */ static void hrtick_update(struct rq *rq) { struct task_struct *curr = rq->curr; if (curr->sched_class != &fair_sched_class) return; if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) hrtick_start_fair(rq, curr); } #else /* !CONFIG_SCHED_HRTICK */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { } static inline void hrtick_update(struct rq *rq) { } #endif /* * The enqueue_task method is called before nr_running is * increased. 
Here we update the fair scheduling stats and * then put the task into the rbtree: */ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int flags = 0; if (wakeup) flags |= ENQUEUE_WAKEUP; if (p->state == TASK_WAKING) flags |= ENQUEUE_MIGRATE; for_each_sched_entity(se) { if (se->on_rq) break; cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, flags); flags = ENQUEUE_WAKEUP; } hrtick_update(rq); } /* * The dequeue_task method is called before nr_running is * decreased. We remove the task from the rbtree and * update the fair scheduling stats: */ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, sleep); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; sleep = 1; } hrtick_update(rq); } /* * sched_yield() support is very simple - we dequeue and enqueue. * * If compat_yield is turned on then we requeue to the end of the tree. */ static void yield_task_fair(struct rq *rq) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *rightmost, *se = &curr->se; /* * Are we the only task in the tree? */ if (unlikely(cfs_rq->nr_running == 1)) return; clear_buddies(cfs_rq, se); if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); return; } /* * Find the rightmost entry in the rbtree: */ rightmost = __pick_last_entity(cfs_rq); /* * Already in the rightmost position? */ if (unlikely(!rightmost || entity_before(rightmost, se))) return; /* * Minimally necessary key value to be last in the tree: * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. */ se->vruntime = rightmost->vruntime + 1; } #ifdef CONFIG_SMP static void task_waking_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); se->vruntime -= cfs_rq->min_vruntime; } #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group * * Adding load to a group doesn't make a group heavier, but can cause movement * of group shares between cpus. Assuming the shares were perfectly aligned one * can calculate the shift in shares. * * The problem is that perfectly aligning the shares is rather expensive, hence * we try to avoid doing that too often - see update_shares(), which ratelimits * this change. * * We compensate this by not only taking the current delta into account, but * also considering the delta between when the shares were last adjusted and * now. * * We still saw a performance dip, some tracing learned us that between * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased * significantly. Therefore try to bias the error in direction of failing * the affine wakeup. * */ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; if (!tg->parent) return wl; /* * By not taking the decrease of shares on the other cpu into * account our error leans towards reducing the affine wakeups. 
*/ if (!wl && sched_feat(ASYM_EFF_LOAD)) return wl; for_each_sched_entity(se) { long S, rw, s, a, b; long more_w; /* * Instead of using this increment, also add the difference * between when the shares were last updated and now. */ more_w = se->my_q->load.weight - se->my_q->rq_weight; wl += more_w; wg += more_w; S = se->my_q->tg->shares; s = se->my_q->shares; rw = se->my_q->rq_weight; a = S*(rw + wl); b = S*rw + s*wg; wl = s*(a-b); if (likely(b)) wl /= b; /* * Assume the group is already running and will * thus already be accounted for in the weight. * * That is, moving shares between CPUs, does not * alter the group weight. */ wg = 0; } return wl; } #else static inline unsigned long effective_load(struct task_group *tg, int cpu, unsigned long wl, unsigned long wg) { return wl; } #endif static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { struct task_struct *curr = current; unsigned long this_load, load; int idx, this_cpu, prev_cpu; unsigned long tl_per_task; unsigned int imbalance; struct task_group *tg; unsigned long weight; int balanced; idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); if (sync) { if (sched_feat(SYNC_LESS) && (curr->se.avg_overlap > sysctl_sched_migration_cost || p->se.avg_overlap > sysctl_sched_migration_cost)) sync = 0; } else { if (sched_feat(SYNC_MORE) && (curr->se.avg_overlap < sysctl_sched_migration_cost && p->se.avg_overlap < sysctl_sched_migration_cost)) sync = 1; } /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load * of the current CPU: */ if (sync) { tg = task_group(current); weight = current->se.load.weight; this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); weight = p->se.load.weight; imbalance = 100 + (sd->imbalance_pct - 100) / 2; /* * In low-load situations, where prev_cpu is idle and this_cpu is idle * due to the sync cause above having dropped this_load to 0, we'll * always have an imbalance, but there's really nothing you can do * about that, so that's good too. * * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ balanced = !this_load || 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); /* * If the currently running task will sleep within * a reasonable amount of time then attract this newly * woken task: */ if (sync && balanced) return 1; schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); if (balanced || (this_load <= load && this_load + target_load(prev_cpu, idx) <= tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and * there is no bad imbalance. */ schedstat_inc(sd, ttwu_move_affine); schedstat_inc(p, se.nr_wakeups_affine); return 1; } return 0; } /* * find_idlest_group finds and returns the least busy CPU group within the * domain. 
*/ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int load_idx) { struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; int imbalance = 100 + (sd->imbalance_pct-100)/2; do { unsigned long load, avg_load; int local_group; int i; /* Skip over this group if it has no CPUs allowed */ if (!cpumask_intersects(sched_group_cpus(group), &p->cpus_allowed)) continue; local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ avg_load = 0; for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); else load = target_load(i, load_idx); avg_load += load; } /* Adjust by relative CPU power of the group */ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; if (local_group) { this_load = avg_load; this = group; } else if (avg_load < min_load) { min_load = avg_load; idlest = group; } } while (group = group->next, group != sd->groups); if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; } /* * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; idlest = i; } } return idlest; } /* * Try and locate an idle CPU in the sched_domain. */ static int select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) { int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); int i; /* * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE * test in select_task_rq_fair) and the prev_cpu is idle then that's * always a better target than the current cpu. */ if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) return prev_cpu; /* * Otherwise, iterate the domain and find an elegible idle cpu. */ for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { if (!cpu_rq(i)->cfs.nr_running) { target = i; break; } } return target; } /* * sched_balance_self: balance the current task (running on cpu) in domains * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and * SD_BALANCE_EXEC. * * Balance, ie. select the least loaded group. * * Returns the target CPU number, or the same CPU if no balancing is needed. * * preempt must be disabled. */ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; int want_sd = 1; int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) { if (sched_feat(AFFINE_WAKEUPS) && cpumask_test_cpu(cpu, &p->cpus_allowed)) want_affine = 1; new_cpu = prev_cpu; } for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) continue; /* * If power savings logic is enabled for a domain, see if we * are not overloaded, if so, don't balance wider. 
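 *
 * For example (purely illustrative numbers): if this domain spans 4 cpus,
 * each with a power of SCHED_LOAD_SCALE, the capacity computed below is 4.
 * With 3 runnable tasks in the span (halved to 1 when
 * SD_POWERSAVINGS_BALANCE is set), nr_running < capacity, so want_sd is
 * cleared and we stop looking at wider domains.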
*/ if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { unsigned long power = 0; unsigned long nr_running = 0; unsigned long capacity; int i; for_each_cpu(i, sched_domain_span(tmp)) { power += power_of(i); nr_running += cpu_rq(i)->cfs.nr_running; } capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); if (tmp->flags & SD_POWERSAVINGS_BALANCE) nr_running /= 2; if (nr_running < capacity) want_sd = 0; } /* * While iterating the domains looking for a spanning * WAKE_AFFINE domain, adjust the affine target to any idle cpu * in cache sharing domains along the way. */ if (want_affine) { int target = -1; /* * If both cpu and prev_cpu are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. */ if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) target = cpu; /* * If there's an idle sibling in this domain, make that * the wake_affine target instead of the current cpu. */ if (tmp->flags & SD_SHARE_PKG_RESOURCES) target = select_idle_sibling(p, tmp, target); if (target >= 0) { if (tmp->flags & SD_WAKE_AFFINE) { affine_sd = tmp; want_affine = 0; } cpu = target; } } if (!want_sd && !want_affine) break; if (!(tmp->flags & sd_flag)) continue; if (want_sd) sd = tmp; } if (sched_feat(LB_SHARES_UPDATE)) { /* * Pick the largest domain to update shares over */ tmp = sd; if (affine_sd && (!tmp || cpumask_weight(sched_domain_span(affine_sd)) > cpumask_weight(sched_domain_span(sd)))) tmp = affine_sd; if (tmp) update_shares(tmp); } if (affine_sd && wake_affine(affine_sd, p, sync)) return cpu; while (sd) { int load_idx = sd->forkexec_idx; struct sched_group *group; int weight; if (!(sd->flags & sd_flag)) { sd = sd->child; continue; } if (sd_flag & SD_BALANCE_WAKE) load_idx = sd->wake_idx; group = find_idlest_group(sd, p, cpu, load_idx); if (!group) { sd = sd->child; continue; } new_cpu = find_idlest_cpu(group, p, cpu); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; continue; } /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; weight = cpumask_weight(sched_domain_span(sd)); sd = NULL; for_each_domain(cpu, tmp) { if (weight <= cpumask_weight(sched_domain_span(tmp))) break; if (tmp->flags & sd_flag) sd = tmp; } /* while loop will break here if sd == NULL */ } return new_cpu; } #endif /* CONFIG_SMP */ /* * Adaptive granularity * * se->avg_wakeup gives the average time a task runs until it does a wakeup, * with the limit of wakeup_gran -- when it never does a wakeup. * * So the smaller avg_wakeup is the faster we want this task to preempt, * but we don't want to treat the preemptee unfairly and therefore allow it * to run for at least the amount of time we'd like to run. * * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one * * NOTE: we use *nr_running to scale with load, this nicely matches the * degrading latency on load. 
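 *
 * For example (illustrative numbers): with se->avg_wakeup of 1 ms and
 * 2 runnable tasks, expected_wakeup is 2 * 1 ms * 2 = 4 ms. If 'curr' has
 * run for 1 ms in its current slice, the granularity becomes
 * 4 ms - 1 ms = 3 ms, and is then clamped to sysctl_sched_wakeup_granularity.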
*/ static unsigned long adaptive_gran(struct sched_entity *curr, struct sched_entity *se) { u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running; u64 gran = 0; if (this_run < expected_wakeup) gran = expected_wakeup - this_run; return min_t(s64, gran, sysctl_sched_wakeup_granularity); } static unsigned long wakeup_gran(struct sched_entity *curr, struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN)) gran = adaptive_gran(curr, se); /* * Since its curr running now, convert the gran from real-time * to virtual-time in his units. */ if (sched_feat(ASYM_GRAN)) { /* * By using 'se' instead of 'curr' we penalize light tasks, so * they get preempted easier. That is, if 'se' < 'curr' then * the resulting gran will be larger, therefore penalizing the * lighter, if otoh 'se' > 'curr' then the resulting gran will * be smaller, again penalizing the lighter task. * * This is especially important for buddies when the leftmost * task is higher priority than the buddy. */ if (unlikely(se->load.weight != NICE_0_LOAD)) gran = calc_delta_fair(gran, se); } else { if (unlikely(curr->load.weight != NICE_0_LOAD)) gran = calc_delta_fair(gran, curr); } return gran; } /* * Should 'se' preempt 'curr'. * * |s1 * |s2 * |s3 * g * |<--->|c * * w(c, s1) = -1 * w(c, s2) = 0 * w(c, s3) = 1 * */ static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) { s64 gran, vdiff = curr->vruntime - se->vruntime; if (vdiff <= 0) return -1; gran = wakeup_gran(curr, se); if (vdiff > gran) return 1; return 0; } static void set_last_buddy(struct sched_entity *se) { if (likely(task_of(se)->policy != SCHED_IDLE)) { for_each_sched_entity(se) cfs_rq_of(se)->last = se; } } static void set_next_buddy(struct sched_entity *se) { if (likely(task_of(se)->policy != SCHED_IDLE)) { for_each_sched_entity(se) cfs_rq_of(se)->next = se; } } /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int sync = wake_flags & WF_SYNC; int scale = cfs_rq->nr_running >= sched_nr_latency; if (unlikely(rt_prio(p->prio))) goto preempt; if (unlikely(p->sched_class != &fair_sched_class)) return; if (unlikely(se == pse)) return; if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) set_next_buddy(pse); /* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. */ if (test_tsk_need_resched(curr)) return; /* * Batch and idle tasks do not preempt (their preemption is driven by * the tick): */ if (unlikely(p->policy != SCHED_NORMAL)) return; /* Idle tasks are by definition preempted by everybody. */ if (unlikely(curr->policy == SCHED_IDLE)) goto preempt; if (sched_feat(WAKEUP_SYNC) && sync) goto preempt; if (sched_feat(WAKEUP_OVERLAP) && se->avg_overlap < sysctl_sched_migration_cost && pse->avg_overlap < sysctl_sched_migration_cost) goto preempt; if (!sched_feat(WAKEUP_PREEMPT)) return; update_curr(cfs_rq); find_matching_se(&se, &pse); BUG_ON(!pse); if (wakeup_preempt_entity(se, pse) == 1) goto preempt; return; preempt: resched_task(curr); /* * Only set the backward buddy when the current task is still * on the rq. 
This can happen when a wakeup gets interleaved * with schedule on the ->pre_schedule() or idle_balance() * point, either of which can * drop the rq lock. * * Also, during early boot the idle thread is in the fair class, * for obvious reasons its a bad idea to schedule back to it. */ if (unlikely(!se->on_rq || curr == rq->idle)) return; if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) set_last_buddy(se); } static struct task_struct *pick_next_task_fair(struct rq *rq) { struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; if (!cfs_rq->nr_running) return NULL; do { se = pick_next_entity(cfs_rq); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); p = task_of(se); hrtick_start_fair(rq, p); return p; } /* * Account for a descheduled task: */ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); put_prev_entity(cfs_rq, se); } } #ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods: */ /* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ static void pull_task(struct rq *src_rq, struct task_struct *p, struct rq *this_rq, int this_cpu) { deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); check_preempt_curr(this_rq, p, 0); } /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { int tsk_cache_hot = 0; /* * We do not migrate tasks that are: * 1) running (obviously), or * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { schedstat_inc(p, se.nr_failed_migrations_affine); return 0; } *all_pinned = 0; if (task_running(rq, p)) { schedstat_inc(p, se.nr_failed_migrations_running); return 0; } /* * Aggressive migration if: * 1) task is cache cold, or * 2) too many balance attempts have failed. */ tsk_cache_hot = task_hot(p, rq->clock, sd); if (!tsk_cache_hot || sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS if (tsk_cache_hot) { schedstat_inc(sd, lb_hot_gained[idle]); schedstat_inc(p, se.nr_forced_migrations); } #endif return 1; } if (tsk_cache_hot) { schedstat_inc(p, se.nr_failed_migrations_hot); return 0; } return 1; } /* * move_one_task tries to move exactly one task from busiest to this_rq, as * part of active balancing operations within "domain". * Returns 1 if successful and 0 otherwise. * * Called with both runqueues locked. */ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle) { struct task_struct *p, *n; struct cfs_rq *cfs_rq; int pinned = 0; for_each_leaf_cfs_rq(busiest, cfs_rq) { list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { if (!can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) continue; pull_task(busiest, p, this_rq, this_cpu); /* * Right now, this is only the second place pull_task() * is called, so we can safely collect pull_task() * stats here rather than inside pull_task(). 
*/ schedstat_inc(sd, lb_gained[idle]); return 1; } } return 0; } static unsigned long balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, struct cfs_rq *busiest_cfs_rq) { int loops = 0, pulled = 0, pinned = 0; long rem_load_move = max_load_move; struct task_struct *p, *n; if (max_load_move == 0) goto out; pinned = 1; list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { if (loops++ > sysctl_sched_nr_migrate) break; if ((p->se.load.weight >> 1) > rem_load_move || !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) continue; pull_task(busiest, p, this_rq, this_cpu); pulled++; rem_load_move -= p->se.load.weight; #ifdef CONFIG_PREEMPT /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is pulled to minimize * the critical section. */ if (idle == CPU_NEWLY_IDLE) break; #endif /* * We only want to steal up to the prescribed amount of * weighted load. */ if (rem_load_move <= 0) break; if (p->prio < *this_best_prio) *this_best_prio = p->prio; } out: /* * Right now, this is one of only two places pull_task() is called, * so we can safely collect pull_task() stats here rather than * inside pull_task(). */ schedstat_add(sd, lb_gained[idle], pulled); if (all_pinned) *all_pinned = pinned; return max_load_move - rem_load_move; } #ifdef CONFIG_FAIR_GROUP_SCHED static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { long rem_load_move = max_load_move; int busiest_cpu = cpu_of(busiest); struct task_group *tg; rcu_read_lock(); update_h_load(busiest_cpu); list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; u64 rem_load, moved_load; /* * empty group */ if (!busiest_cfs_rq->task_weight) continue; rem_load = (u64)rem_load_move * busiest_weight; rem_load = div_u64(rem_load, busiest_h_load + 1); moved_load = balance_tasks(this_rq, this_cpu, busiest, rem_load, sd, idle, all_pinned, this_best_prio, busiest_cfs_rq); if (!moved_load) continue; moved_load *= busiest_h_load; moved_load = div_u64(moved_load, busiest_weight + 1); rem_load_move -= moved_load; if (rem_load_move < 0) break; } rcu_read_unlock(); return max_load_move - rem_load_move; } #else static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, idle, all_pinned, this_best_prio, &busiest->cfs); } #endif /* * move_tasks tries to move up to max_load_move weighted load from busiest to * this_rq, as part of a balancing operation within domain "sd". * Returns 1 if successful and 0 otherwise. * * Called with both runqueues locked. 
*/ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { unsigned long total_load_moved = 0, load_moved; int this_best_prio = this_rq->curr->prio; do { load_moved = load_balance_fair(this_rq, this_cpu, busiest, max_load_move - total_load_moved, sd, idle, all_pinned, &this_best_prio); total_load_moved += load_moved; #ifdef CONFIG_PREEMPT /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is pulled to minimize * the critical section. */ if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) break; if (raw_spin_is_contended(&this_rq->lock) || raw_spin_is_contended(&busiest->lock)) break; #endif } while (load_moved && max_load_move > total_load_moved); return total_load_moved > 0; } /********** Helpers for find_busiest_group ************************/ /* * sd_lb_stats - Structure to store the statistics of a sched_domain * during load balancing. */ struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *this; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_pwr; /* Total power of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ /** Statistics of this group */ unsigned long this_load; unsigned long this_load_per_task; unsigned long this_nr_running; /* Statistics of the busiest group */ unsigned long max_load; unsigned long busiest_load_per_task; unsigned long busiest_nr_running; unsigned long busiest_group_capacity; int group_imb; /* Is there imbalance in this sd */ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) int power_savings_balance; /* Is powersave balance needed for this sd */ struct sched_group *group_min; /* Least loaded group in sd */ struct sched_group *group_leader; /* Group which relieves group_min */ unsigned long min_load_per_task; /* load_per_task in group_min */ unsigned long leader_nr_running; /* Nr running of group_leader */ unsigned long min_nr_running; /* Nr running of group_min */ #endif }; /* * sg_lb_stats - stats of a sched_group required for load_balancing */ struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long sum_nr_running; /* Nr tasks running in the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long group_capacity; int group_imb; /* Is there an imbalance in the group ? */ }; /** * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. * @group: The group whose first cpu is to be returned. */ static inline unsigned int group_first_cpu(struct sched_group *group) { return cpumask_first(sched_group_cpus(group)); } /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. * @idle: The Idle status of the CPU for whose sd load_icx is obtained. */ static inline int get_sd_load_idx(struct sched_domain *sd, enum cpu_idle_type idle) { int load_idx; switch (idle) { case CPU_NOT_IDLE: load_idx = sd->busy_idx; break; case CPU_NEWLY_IDLE: load_idx = sd->newidle_idx; break; default: load_idx = sd->idle_idx; break; } return load_idx; } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) /** * init_sd_power_savings_stats - Initialize power savings statistics for * the given sched_domain, during load balancing. 
 *
 * @sd: Sched domain whose power-savings statistics are to be initialized.
 * @sds: Variable containing the statistics for sd.
 * @idle: Idle status of the CPU at which we're performing load-balancing.
 */
static inline void init_sd_power_savings_stats(struct sched_domain *sd,
	struct sd_lb_stats *sds, enum cpu_idle_type idle)
{
	/*
	 * Busy processors will not participate in power savings
	 * balance.
	 */
	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
		sds->power_savings_balance = 0;
	else {
		sds->power_savings_balance = 1;
		sds->min_nr_running = ULONG_MAX;
		sds->leader_nr_running = 0;
	}
}

/**
 * update_sd_power_savings_stats - Update the power saving stats for a
 * sched_domain while performing load balancing.
 *
 * @group: sched_group belonging to the sched_domain under consideration.
 * @sds: Variable containing the statistics of the sched_domain
 * @local_group: Does group contain the CPU for which we're performing
 *		load balancing?
 * @sgs: Variable containing the statistics of the group.
 */
static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
	if (!sds->power_savings_balance)
		return;

	/*
	 * If the local group is idle or completely loaded
	 * no need to do power savings balance at this domain
	 */
	if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
				!sds->this_nr_running))
		sds->power_savings_balance = 0;

	/*
	 * If a group is already running at full capacity or idle,
	 * don't include that group in power savings calculations
	 */
	if (!sds->power_savings_balance ||
		sgs->sum_nr_running >= sgs->group_capacity ||
		!sgs->sum_nr_running)
		return;

	/*
	 * Calculate the group which has the least non-idle load.
	 * This is the group from where we need to pick up the load
	 * for saving power
	 */
	if ((sgs->sum_nr_running < sds->min_nr_running) ||
	    (sgs->sum_nr_running == sds->min_nr_running &&
	     group_first_cpu(group) > group_first_cpu(sds->group_min))) {
		sds->group_min = group;
		sds->min_nr_running = sgs->sum_nr_running;
		sds->min_load_per_task = sgs->sum_weighted_load /
					 sgs->sum_nr_running;
	}

	/*
	 * Calculate the group which is almost near its
	 * capacity but still has some space to pick up some load
	 * from other groups and save more power
	 */
	if (sgs->sum_nr_running + 1 > sgs->group_capacity)
		return;

	if (sgs->sum_nr_running > sds->leader_nr_running ||
	    (sgs->sum_nr_running == sds->leader_nr_running &&
	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
		sds->group_leader = group;
		sds->leader_nr_running = sgs->sum_nr_running;
	}
}

/**
 * check_power_save_busiest_group - see if there is potential for some power-savings balance
 * @sds: Variable containing the statistics of the sched_domain
 *	under consideration.
 * @this_cpu: Cpu at which we're currently performing load-balancing.
 * @imbalance: Variable to store the imbalance.
 *
 * Description:
 * Check if we have potential to perform some power-savings balance.
 * If yes, set the busiest group to be the least loaded group in the
 * sched_domain, so that its CPUs can be put to idle.
 *
 * Returns 1 if there is potential to perform power-savings balance.
 * Else returns 0.
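 *
 * For example (purely illustrative): if the local group is the current
 * group_leader and some other, nearly idle group is group_min, *imbalance
 * is set to group_min's load per task and group_min is returned as the
 * busiest group, so that its remaining load can be pulled here and its
 * CPUs put to idle.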
*/ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, int this_cpu, unsigned long *imbalance) { if (!sds->power_savings_balance) return 0; if (sds->this != sds->group_leader || sds->group_leader == sds->group_min) return 0; *imbalance = sds->min_load_per_task; sds->busiest = sds->group_min; return 1; } #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ static inline void init_sd_power_savings_stats(struct sched_domain *sd, struct sd_lb_stats *sds, enum cpu_idle_type idle) { return; } static inline void update_sd_power_savings_stats(struct sched_group *group, struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) { return; } static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, int this_cpu, unsigned long *imbalance) { return 0; } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) { return SCHED_LOAD_SCALE; } unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) { return default_scale_freq_power(sd, cpu); } unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) { unsigned long weight = cpumask_weight(sched_domain_span(sd)); unsigned long smt_gain = sd->smt_gain; smt_gain /= weight; return smt_gain; } unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) { return default_scale_smt_power(sd, cpu); } unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available; sched_avg_update(rq); total = sched_avg_period() + (rq->clock - rq->age_stamp); available = total - rq->rt_avg; if (unlikely((s64)total < SCHED_LOAD_SCALE)) total = SCHED_LOAD_SCALE; total >>= SCHED_LOAD_SHIFT; return div_u64(available, total); } static void update_cpu_power(struct sched_domain *sd, int cpu) { unsigned long weight = cpumask_weight(sched_domain_span(sd)); unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; if (sched_feat(ARCH_POWER)) power *= arch_scale_freq_power(sd, cpu); else power *= default_scale_freq_power(sd, cpu); power >>= SCHED_LOAD_SHIFT; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) power *= arch_scale_smt_power(sd, cpu); else power *= default_scale_smt_power(sd, cpu); power >>= SCHED_LOAD_SHIFT; } power *= scale_rt_power(cpu); power >>= SCHED_LOAD_SHIFT; if (!power) power = 1; sdg->cpu_power = power; } static void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; unsigned long power; if (!child) { update_cpu_power(sd, cpu); return; } power = 0; group = child->groups; do { power += group->cpu_power; group = group->next; } while (group != child->groups); sdg->cpu_power = power; } /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @sd: The sched_domain whose statistics are to be updated. * @group: sched_group whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu * @load_idx: Load index of sched_domain of this_cpu for load calc. * @sd_idle: Idle status of the sched_domain containing group. * @local_group: Does group contain this_cpu. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. 
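 *
 * Note that the resulting sgs->avg_load is scaled by the group's cpu_power:
 * avg_load = group_load * SCHED_LOAD_SCALE / cpu_power. For example
 * (illustrative numbers), a group_load of 2048 on a group with a cpu_power
 * of 2048 yields an avg_load of 1024, i.e. one nice-0 task's load per unit
 * of compute capacity.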
*/ static inline void update_sg_lb_stats(struct sched_domain *sd, struct sched_group *group, int this_cpu, enum cpu_idle_type idle, int load_idx, int *sd_idle, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { unsigned long load, max_cpu_load, min_cpu_load; int i; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long avg_load_per_task = 0; if (local_group) balance_cpu = group_first_cpu(group); /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; min_cpu_load = ~0UL; for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); if (*sd_idle && rq->nr_running) *sd_idle = 0; /* Bias balancing toward cpus of our domain */ if (local_group) { if (idle_cpu(i) && !first_idle_cpu) { first_idle_cpu = 1; balance_cpu = i; } load = target_load(i, load_idx); } else { load = source_load(i, load_idx); if (load > max_cpu_load) max_cpu_load = load; if (min_cpu_load > load) min_cpu_load = load; } sgs->group_load += load; sgs->sum_nr_running += rq->nr_running; sgs->sum_weighted_load += weighted_cpuload(i); } /* * First idle cpu or the first cpu(busiest) in this sched group * is eligible for doing load balancing at this and above * domains. In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. */ if (idle != CPU_NEWLY_IDLE && local_group && balance_cpu != this_cpu) { *balance = 0; return; } update_group_power(sd, this_cpu); /* Adjust by relative CPU power of the group */ sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; /* * Consider the group unbalanced when the imbalance is larger * than the average weight of two tasks. * * APZ: with cgroup the avg task weight can vary wildly and * might not be a suitable number - should we keep a * normalized nr_running number somewhere that negates * the hierarchy? */ if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) sgs->group_imb = 1; sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); } /** * update_sd_lb_stats - Update sched_group's statistics for load balancing. * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu * @sd_idle: Idle status of the sched_domain containing group. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, enum cpu_idle_type idle, int *sd_idle, const struct cpumask *cpus, int *balance, struct sd_lb_stats *sds) { struct sched_domain *child = sd->child; struct sched_group *group = sd->groups; struct sg_lb_stats sgs; int load_idx, prefer_sibling = 0; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; init_sd_power_savings_stats(sd, sds, idle); load_idx = get_sd_load_idx(sd, idle); do { int local_group; local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); memset(&sgs, 0, sizeof(sgs)); update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); if (local_group && !(*balance)) return; sds->total_load += sgs.group_load; sds->total_pwr += group->cpu_power; /* * In case the child domain prefers tasks go to siblings * first, lower the group capacity to one so that we'll try * and move all the excess tasks away. 
*/ if (prefer_sibling) sgs.group_capacity = min(sgs.group_capacity, 1UL); if (local_group) { sds->this_load = sgs.avg_load; sds->this = group; sds->this_nr_running = sgs.sum_nr_running; sds->this_load_per_task = sgs.sum_weighted_load; } else if (sgs.avg_load > sds->max_load && (sgs.sum_nr_running > sgs.group_capacity || sgs.group_imb)) { sds->max_load = sgs.avg_load; sds->busiest = group; sds->busiest_nr_running = sgs.sum_nr_running; sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; sds->group_imb = sgs.group_imb; } update_sd_power_savings_stats(group, sds, local_group, &sgs); group = group->next; } while (group != sd->groups); } /** * fix_small_imbalance - Calculate the minor imbalance that exists * amongst the groups of a sched_domain, during * load balancing. * @sds: Statistics of the sched_domain whose imbalance is to be calculated. * @this_cpu: The cpu at whose sched_domain we're performing load-balance. * @imbalance: Variable to store the imbalance. */ static inline void fix_small_imbalance(struct sd_lb_stats *sds, int this_cpu, unsigned long *imbalance) { unsigned long tmp, pwr_now = 0, pwr_move = 0; unsigned int imbn = 2; unsigned long scaled_busy_load_per_task; if (sds->this_nr_running) { sds->this_load_per_task /= sds->this_nr_running; if (sds->busiest_load_per_task > sds->this_load_per_task) imbn = 1; } else sds->this_load_per_task = cpu_avg_load_per_task(this_cpu); scaled_busy_load_per_task = sds->busiest_load_per_task * SCHED_LOAD_SCALE; scaled_busy_load_per_task /= sds->busiest->cpu_power; if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= (scaled_busy_load_per_task * imbn)) { *imbalance = sds->busiest_load_per_task; return; } /* * OK, we don't have enough imbalance to justify moving tasks, * however we may be able to increase total CPU power used by * moving them. */ pwr_now += sds->busiest->cpu_power * min(sds->busiest_load_per_task, sds->max_load); pwr_now += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / sds->busiest->cpu_power; if (sds->max_load > tmp) pwr_move += sds->busiest->cpu_power * min(sds->busiest_load_per_task, sds->max_load - tmp); /* Amount of load we'd add */ if (sds->max_load * sds->busiest->cpu_power < sds->busiest_load_per_task * SCHED_LOAD_SCALE) tmp = (sds->max_load * sds->busiest->cpu_power) / sds->this->cpu_power; else tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / sds->this->cpu_power; pwr_move += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ if (pwr_move > pwr_now) *imbalance = sds->busiest_load_per_task; } /** * calculate_imbalance - Calculate the amount of imbalance present within the * groups of a given sched_domain during load balance. * @sds: statistics of the sched_domain whose imbalance is to be calculated. * @this_cpu: Cpu for which currently load balance is being performed. * @imbalance: The variable to store the imbalance. 
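 *
 * A worked example with purely illustrative numbers (SCHED_LOAD_SCALE at
 * its usual 1024): if the busiest group runs 3 tasks with a capacity of 2
 * and a cpu_power of 1024, load_above_capacity is
 * (3 - 2) * 1024 * 1024 / 1024 = 1024. With max_load = 3072,
 * avg_load = 2048, this_load = 1024 and this->cpu_power = 1024,
 * max_pull = min(1024, 1024) = 1024 and *imbalance becomes
 * min(1024 * 1024, 1024 * 1024) / 1024 = 1024, i.e. roughly one nice-0
 * task's worth of weighted load.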
*/ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, unsigned long *imbalance) { unsigned long max_pull, load_above_capacity = ~0UL; sds->busiest_load_per_task /= sds->busiest_nr_running; if (sds->group_imb) { sds->busiest_load_per_task = min(sds->busiest_load_per_task, sds->avg_load); } /* * In the presence of smp nice balancing, certain scenarios can have * max load less than avg load(as we skip the groups at or below * its cpu_power, while calculating max_load..) */ if (sds->max_load < sds->avg_load) { *imbalance = 0; return fix_small_imbalance(sds, this_cpu, imbalance); } if (!sds->group_imb) { /* * Don't want to pull so many tasks that a group would go idle. */ load_above_capacity = (sds->busiest_nr_running - sds->busiest_group_capacity); load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); load_above_capacity /= sds->busiest->cpu_power; } /* * We're trying to get all the cpus to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to * reduce the max loaded cpu below the average load. At the same time, * we also don't want to reduce the group load below the group capacity * (so that we can implement power-savings policies etc). Thus we look * for the minimum possible imbalance. * Be careful of negative numbers as they'll appear as very large values * with unsigned longs. */ max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ *imbalance = min(max_pull * sds->busiest->cpu_power, (sds->avg_load - sds->this_load) * sds->this->cpu_power) / SCHED_LOAD_SCALE; /* * if *imbalance is less than the average load per runnable task * there is no gaurantee that any tasks will be moved so we'll have * a think about bumping its value to force at least one task to be * moved */ if (*imbalance < sds->busiest_load_per_task) return fix_small_imbalance(sds, this_cpu, imbalance); } /******* find_busiest_group() helpers end here *********************/ /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. If there isn't an imbalance, and * the user has opted for power-savings, it returns a group whose * CPUs can be put to idle by rebalancing those tasks elsewhere, if * such a group exists. * * Also calculates the amount of weighted load which should be moved * to restore balance. * * @sd: The sched_domain whose busiest group is to be returned. * @this_cpu: The cpu for which load balancing is currently being performed. * @imbalance: Variable which stores amount of weighted load which should * be moved to restore balance/put a group to idle. * @idle: The idle status of this_cpu. * @sd_idle: The idleness of sd * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. * * Returns: - the busiest group if imbalance exists. * - If no imbalance and user has opted for power-savings balance, * return the least loaded group whose CPUs can be * put to idle by rebalancing its tasks onto our group. */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, int *sd_idle, const struct cpumask *cpus, int *balance) { struct sd_lb_stats sds; memset(&sds, 0, sizeof(sds)); /* * Compute the various statistics relavent for load balancing at * this level. 
	 */
	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, balance, &sds);

	/* Cases where imbalance does not exist from POV of this_cpu */
	/* 1) this_cpu is not the appropriate cpu to perform load balancing
	 *    at this level.
	 * 2) There is no busy sibling group to pull from.
	 * 3) This group is the busiest group.
	 * 4) This group is more busy than the avg busyness at this
	 *    sched_domain.
	 * 5) The imbalance is within the specified limit.
	 */
	if (!(*balance))
		goto ret;

	if (!sds.busiest || sds.busiest_nr_running == 0)
		goto out_balanced;

	if (sds.this_load >= sds.max_load)
		goto out_balanced;

	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;

	if (sds.this_load >= sds.avg_load)
		goto out_balanced;

	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
		goto out_balanced;

	/* Looks like there is an imbalance. Compute it */
	calculate_imbalance(&sds, this_cpu, imbalance);
	return sds.busiest;

out_balanced:
	/*
	 * There is no obvious imbalance. But check if we can do some balancing
	 * to save power.
	 */
	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
		return sds.busiest;
ret:
	*imbalance = 0;
	return NULL;
}

/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
static struct rq *
find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
		   unsigned long imbalance, const struct cpumask *cpus)
{
	struct rq *busiest = NULL, *rq;
	unsigned long max_load = 0;
	int i;

	for_each_cpu(i, sched_group_cpus(group)) {
		unsigned long power = power_of(i);
		unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
		unsigned long wl;

		if (!cpumask_test_cpu(i, cpus))
			continue;

		rq = cpu_rq(i);
		wl = weighted_cpuload(i);

		/*
		 * When comparing with imbalance, use weighted_cpuload()
		 * which is not scaled with the cpu power.
		 */
		if (capacity && rq->nr_running == 1 && wl > imbalance)
			continue;

		/*
		 * For the load comparisons with the other cpus, consider
		 * the weighted_cpuload() scaled with the cpu power, so that
		 * the load can be moved away from the cpu that is potentially
		 * running at a lower capacity.
		 */
		wl = (wl * SCHED_LOAD_SCALE) / power;

		if (wl > max_load) {
			max_load = wl;
			busiest = rq;
		}
	}

	return busiest;
}

/*
 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
 * so long as it is large enough.
 */
#define MAX_PINNED_INTERVAL	512

/* Working cpumask for load_balance and load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);

static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
{
	if (idle == CPU_NEWLY_IDLE) {
		/*
		 * The only task running in a non-idle cpu can be moved to this
		 * cpu in an attempt to completely free up the other CPU
		 * package.
		 *
		 * The package power saving logic comes from
		 * find_busiest_group(). If there is no imbalance, then
		 * f_b_g() will return NULL. However when sched_mc={1,2} then
		 * f_b_g() will select a group from which a running task may be
		 * pulled to this cpu in order to make the other package idle.
		 * If there is no opportunity to make a package idle and if
		 * there is no imbalance, then f_b_g() will return NULL and no
		 * action will be taken in load_balance_newidle().
		 *
		 * Under normal task pull operation due to imbalance, there
		 * will be more than one task in the source run queue and
		 * move_tasks() will succeed. ld_moved will be true and this
		 * active balance code will not be triggered.
*/ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return 0; if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) return 0; } return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, * let the state of idle sibling percolate up as CPU_IDLE, instead of * portraying it as CPU_NOT_IDLE. */ if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; schedstat_inc(sd, lb_count[idle]); redo: update_shares(sd); group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, cpus, balance); if (*balance == 0) goto out_balanced; if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; } busiest = find_busiest_queue(group, idle, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; } BUG_ON(busiest == this_rq); schedstat_add(sd, lb_imbalance[idle], imbalance); ld_moved = 0; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found * an imbalance but busiest->nr_running <= 1, the group is * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ local_irq_save(flags); double_rq_lock(this_rq, busiest); ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); /* * some other cpu did the load balance for us. */ if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { cpumask_clear_cpu(cpu_of(busiest), cpus); if (!cpumask_empty(cpus)) goto redo; goto out_balanced; } } if (!ld_moved) { schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; if (need_active_balance(sd, sd_idle, idle)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { raw_spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; } if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; active_balance = 1; } raw_spin_unlock_irqrestore(&busiest->lock, flags); if (active_balance) wake_up_process(busiest->migration_thread); /* * We've kicked active balancing, reset the failure * counter. */ sd->nr_balance_failed = sd->cache_nice_tries+1; } } else sd->nr_balance_failed = 0; if (likely(!active_balance)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; } else { /* * If we've begun active balancing, start to back off. This * case may not be covered by the all_pinned logic if there * is only 1 task on the busy runqueue (because we don't call * move_tasks). 
*/ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; } if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) ld_moved = -1; goto out; out_balanced: schedstat_inc(sd, lb_balanced[idle]); sd->nr_balance_failed = 0; out_one_pinned: /* tune up the balancing interval */ if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) ld_moved = -1; else ld_moved = 0; out: if (ld_moved) update_shares(sd); return ld_moved; } /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; this_rq->idle_stamp = this_rq->clock; if (this_rq->avg_idle < sysctl_sched_migration_cost) return; /* * Drop the rq->lock, but keep IRQ/preempt disabled. */ raw_spin_unlock(&this_rq->lock); for_each_domain(this_cpu, sd) { unsigned long interval; int balance = 1; if (!(sd->flags & SD_LOAD_BALANCE)) continue; if (sd->flags & SD_BALANCE_NEWIDLE) { /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &balance); } interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; if (pulled_task) { this_rq->idle_stamp = 0; break; } } raw_spin_lock(&this_rq->lock); if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* * We are going idle. next_balance may be set based on * a busy processor. So reset next_balance. */ this_rq->next_balance = next_balance; } } /* * active_load_balance is run by migration threads. It pushes running tasks * off the busiest CPU onto idle CPUs. It requires at least 1 task to be * running on each physical CPU where possible, and avoids physical / * logical imbalances. * * Called with busiest_rq locked. */ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) { int target_cpu = busiest_rq->push_cpu; struct sched_domain *sd; struct rq *target_rq; /* Is there any task to move? */ if (busiest_rq->nr_running <= 1) return; target_rq = cpu_rq(target_cpu); /* * This condition is "impossible", if it occurs * we need to fix it. Originally reported by * Bjorn Helgaas on a 128-cpu setup. */ BUG_ON(busiest_rq == target_rq); /* move a task from busiest_rq to target_rq */ double_lock_balance(busiest_rq, target_rq); update_rq_clock(busiest_rq); update_rq_clock(target_rq); /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; } if (likely(sd)) { schedstat_inc(sd, alb_count); if (move_one_task(target_rq, target_cpu, busiest_rq, sd, CPU_IDLE)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); } double_unlock_balance(busiest_rq, target_rq); } #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; cpumask_var_t cpu_mask; cpumask_var_t ilb_grp_nohz_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), }; int get_nohz_load_balancer(void) { return atomic_read(&nohz.load_balancer); } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) /** * lowest_flag_domain - Return lowest sched_domain containing flag. 
 * @cpu: The cpu whose lowest level of sched domain is to
 *	be returned.
 * @flag: The flag to check for the lowest sched_domain
 *	for the given cpu.
 *
 * Returns the lowest sched_domain of a cpu which contains the given flag.
 */
static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
{
	struct sched_domain *sd;

	for_each_domain(cpu, sd)
		if (sd && (sd->flags & flag))
			break;

	return sd;
}

/**
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
 * @cpu: The cpu whose domains we're iterating over.
 * @sd: variable holding the value of the power_savings_sd
 *	for cpu.
 * @flag: The flag to filter the sched_domains to be iterated.
 *
 * Iterates over all the scheduler domains for a given cpu that have the
 * 'flag' set, starting from the lowest sched_domain to the highest.
 */
#define for_each_flag_domain(cpu, sd, flag) \
	for (sd = lowest_flag_domain(cpu, flag); \
		(sd && (sd->flags & flag)); sd = sd->parent)

/**
 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
 * @ilb_group: group to be checked for semi-idleness
 *
 * Returns: 1 if the group is semi-idle. 0 otherwise.
 *
 * We define a sched_group to be semi-idle if it has at least one idle CPU
 * and at least one non-idle CPU. This helper function checks if the given
 * sched_group is semi-idle or not.
 */
static inline int is_semi_idle_group(struct sched_group *ilb_group)
{
	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
					sched_group_cpus(ilb_group));

	/*
	 * A sched_group is semi-idle when it has at least one busy cpu
	 * and at least one idle cpu.
	 */
	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
		return 0;

	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
		return 0;

	return 1;
}

/**
 * find_new_ilb - Finds the optimum idle load balancer for nomination.
 * @cpu: The cpu which is nominating a new idle_load_balancer.
 *
 * Returns: Returns the id of the idle load balancer if it exists,
 * Else, returns >= nr_cpu_ids.
 *
 * This algorithm picks the idle load balancer such that it belongs to a
 * semi-idle powersavings sched_domain. The idea is to try and avoid
 * completely idle packages/cores just for the purpose of idle load balancing
 * when there are other idle cpus which are better suited for that job.
 */
static int find_new_ilb(int cpu)
{
	struct sched_domain *sd;
	struct sched_group *ilb_group;

	/*
	 * Have idle load balancer selection from semi-idle packages only
	 * when power-aware load balancing is enabled
	 */
	if (!(sched_smt_power_savings || sched_mc_power_savings))
		goto out_done;

	/*
	 * Optimize for the case when we have no idle CPUs or only one
	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
	 */
	if (cpumask_weight(nohz.cpu_mask) < 2)
		goto out_done;

	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
		ilb_group = sd->groups;

		do {
			if (is_semi_idle_group(ilb_group))
				return cpumask_first(nohz.ilb_grp_nohz_mask);

			ilb_group = ilb_group->next;

		} while (ilb_group != sd->groups);
	}

out_done:
	return cpumask_first(nohz.cpu_mask);
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu)
{
	return cpumask_first(nohz.cpu_mask);
}
#endif

/*
 * This routine will try to nominate the ilb (idle load balancing)
 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
 * load balancing on behalf of all those cpus. If all the cpus in the system
 * go into this tickless mode, then there will be no ilb owner (as there is
 * no need for one) and all the cpus will sleep till the next wakeup event
 * arrives...
 *
 * For the ilb owner, tick is not stopped.
And this tick will be used * for idle load balancing. ilb owner will still be part of * nohz.cpu_mask.. * * While stopping the tick, this cpu will become the ilb owner if there * is no other owner. And will be the owner till that cpu becomes busy * or if all cpus in the system stop their ticks at which point * there is no need for ilb owner. * * When the ilb owner becomes busy, it nominates another owner, during the * next busy scheduler_tick() */ int select_nohz_load_balancer(int stop_tick) { int cpu = smp_processor_id(); if (stop_tick) { cpu_rq(cpu)->in_nohz_recently = 1; if (!cpu_active(cpu)) { if (atomic_read(&nohz.load_balancer) != cpu) return 0; /* * If we are going offline and still the leader, * give up! */ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) BUG(); return 0; } cpumask_set_cpu(cpu, nohz.cpu_mask); /* time for ilb owner also to sleep */ if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; } if (atomic_read(&nohz.load_balancer) == -1) { /* make me the ilb owner */ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) return 1; } else if (atomic_read(&nohz.load_balancer) == cpu) { int new_ilb; if (!(sched_smt_power_savings || sched_mc_power_savings)) return 1; /* * Check to see if there is a more power-efficient * ilb. */ new_ilb = find_new_ilb(cpu); if (new_ilb < nr_cpu_ids && new_ilb != cpu) { atomic_set(&nohz.load_balancer, -1); resched_cpu(new_ilb); return 0; } return 1; } } else { if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; cpumask_clear_cpu(cpu, nohz.cpu_mask); if (atomic_read(&nohz.load_balancer) == cpu) if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) BUG(); } return 0; } #endif static DEFINE_SPINLOCK(balancing); /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * * Balancing parameters are set up in arch_init_sched_domains. */ static void rebalance_domains(int cpu, enum cpu_idle_type idle) { int balance = 1; struct rq *rq = cpu_rq(cpu); unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; interval = sd->balance_interval; if (idle != CPU_IDLE) interval *= sd->busy_factor; /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); if (unlikely(!interval)) interval = 1; if (interval > HZ*NR_CPUS/10) interval = HZ*NR_CPUS/10; need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { if (!spin_trylock(&balancing)) goto out; } if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is * not idle. */ idle = CPU_NOT_IDLE; } sd->last_balance = jiffies; } if (need_serialize) spin_unlock(&balancing); out: if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; update_next_balance = 1; } /* * Stop the load balance at this level. There is another * CPU in our sched group which is doing load balancing more * actively. */ if (!balance) break; } /* * next_balance will be updated only when there is a need. * When the cpu is attached to null domain for ex, it will not be * updated. 
*/ if (likely(update_next_balance)) rq->next_balance = next_balance; } /* * run_rebalance_domains is triggered when needed from the scheduler tick. * In CONFIG_NO_HZ case, the idle load balance owner will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ static void run_rebalance_domains(struct softirq_action *h) { int this_cpu = smp_processor_id(); struct rq *this_rq = cpu_rq(this_cpu); enum cpu_idle_type idle = this_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE; rebalance_domains(this_cpu, idle); #ifdef CONFIG_NO_HZ /* * If this cpu is the owner for idle load balancing, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. */ if (this_rq->idle_at_tick && atomic_read(&nohz.load_balancer) == this_cpu) { struct rq *rq; int balance_cpu; for_each_cpu(balance_cpu, nohz.cpu_mask) { if (balance_cpu == this_cpu) continue; /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load * balancing owner will pick it up. */ if (need_resched()) break; rebalance_domains(balance_cpu, CPU_IDLE); rq = cpu_rq(balance_cpu); if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; } } #endif } static inline int on_null_domain(int cpu) { return !rcu_dereference(cpu_rq(cpu)->sd); } /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. * * In case of CONFIG_NO_HZ, this is the place where we nominate a new * idle load balancing owner or decide to stop the periodic load balancing, * if the whole system is idle. */ static inline void trigger_load_balance(struct rq *rq, int cpu) { #ifdef CONFIG_NO_HZ /* * If we were in the nohz mode recently and busy at the current * scheduler tick, then check if we need to nominate new idle * load balancer. */ if (rq->in_nohz_recently && !rq->idle_at_tick) { rq->in_nohz_recently = 0; if (atomic_read(&nohz.load_balancer) == cpu) { cpumask_clear_cpu(cpu, nohz.cpu_mask); atomic_set(&nohz.load_balancer, -1); } if (atomic_read(&nohz.load_balancer) == -1) { int ilb = find_new_ilb(cpu); if (ilb < nr_cpu_ids) resched_cpu(ilb); } } /* * If this cpu is idle and doing idle load balancing for all the * cpus with ticks stopped, is it time for that to stop? 
*/ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { resched_cpu(cpu); return; } /* * If this cpu is idle and the idle load balancing is done by * someone else, then no need raise the SCHED_SOFTIRQ */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && cpumask_test_cpu(cpu, nohz.cpu_mask)) return; #endif /* Don't need to rebalance while attached to NULL domain */ if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); } static void rq_online_fair(struct rq *rq) { update_sysctl(); } static void rq_offline_fair(struct rq *rq) { update_sysctl(); } #else /* CONFIG_SMP */ /* * on UP we do not need to balance between CPUs: */ static inline void idle_balance(int cpu, struct rq *rq) { } #endif /* CONFIG_SMP */ /* * scheduler tick hitting a task of our scheduling class: */ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); } } /* * called on fork with the child task as argument from the parent's context * - child not yet on the tasklist * - preemption disabled */ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(current); struct sched_entity *se = &p->se, *curr = cfs_rq->curr; int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); unsigned long flags; raw_spin_lock_irqsave(&rq->lock, flags); if (unlikely(task_cpu(p) != this_cpu)) __set_task_cpu(p, this_cpu); update_curr(cfs_rq); if (curr) se->vruntime = curr->vruntime; place_entity(cfs_rq, se, 1); if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); resched_task(rq->curr); } se->vruntime -= cfs_rq->min_vruntime; raw_spin_unlock_irqrestore(&rq->lock, flags); } /* * Priority of the task has changed. Check to see if we preempt * the current task. */ static void prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio, int running) { /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ if (running) { if (p->prio > oldprio) resched_task(rq->curr); } else check_preempt_curr(rq, p, 0); } /* * We switched to the sched_fair class. */ static void switched_to_fair(struct rq *rq, struct task_struct *p, int running) { /* * We were most likely switched from sched_rt, so * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ if (running) resched_task(rq->curr); else check_preempt_curr(rq, p, 0); } /* Account for a task changing its policy or group. * * This routine is mostly called to set cfs_rq->curr field when a task * migrates between groups/classes. 
*/ static void set_curr_task_fair(struct rq *rq) { struct sched_entity *se = &rq->curr->se; for_each_sched_entity(se) set_next_entity(cfs_rq_of(se), se); } #ifdef CONFIG_FAIR_GROUP_SCHED static void moved_group_fair(struct task_struct *p, int on_rq) { struct cfs_rq *cfs_rq = task_cfs_rq(p); update_curr(cfs_rq); if (!on_rq) place_entity(cfs_rq, &p->se, 1); } #endif static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) { struct sched_entity *se = &task->se; unsigned int rr_interval = 0; /* * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise * idle runqueue: */ if (rq->cfs.load.weight) rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); return rr_interval; } /* * All the scheduling class methods: */ static const struct sched_class fair_sched_class = { .next = &idle_sched_class, .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, .task_waking = task_waking_fair, #endif .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, .get_rr_interval = get_rr_interval_fair, #ifdef CONFIG_FAIR_GROUP_SCHED .moved_group = moved_group_fair, #endif }; #ifdef CONFIG_SCHED_DEBUG static void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq; rcu_read_lock(); for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } #endif
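The wakeup-preemption decision used by check_preempt_wakeup() above reduces to a three-way comparison of virtual runtimes against a granularity. A minimal, self-contained userspace sketch of that comparison (hypothetical values, not part of this file):

#include <stdio.h>

/* Illustrative sketch of the wakeup_preempt_entity() decision:
 * -1: the waking entity is not ahead of curr, no preemption,
 *  0: ahead, but still within the wakeup granularity window,
 *  1: far enough ahead in virtual time to preempt curr.
 */
static int wakeup_preempt_decision(long long curr_vruntime,
				   long long se_vruntime, long long gran)
{
	long long vdiff = curr_vruntime - se_vruntime;

	if (vdiff <= 0)
		return -1;
	if (vdiff > gran)
		return 1;
	return 0;
}

int main(void)
{
	/* Hypothetical vruntimes in ns: curr at 3000000, waking task at
	 * 1000000, granularity 1500000 -> prints 1 (preempt curr). */
	printf("%d\n", wakeup_preempt_decision(3000000, 1000000, 1500000));
	return 0;
}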