summaryrefslogtreecommitdiffstats
path: root/scripts
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2018-04-26 09:54:27 -0400
committerJonathan Corbet <corbet@lwn.net>2018-04-27 18:45:49 -0400
commit5385a295ec00eb80525ec7ff1d97e13e06ba77b7 (patch)
tree66ee4ad7e475ec5406d9f81ad5498619809d15d6 /scripts
parentf91af1c69c57664a508fa054ce1e2cdf74741f00 (diff)
scripts: Add SPDX checker script
The use of SPDX-License-Identifiers in the kernel is growing, and so is the number of malformed license expressions and of license IDs which have no corresponding license text file in the LICENSES directory. Add a script which gathers information from the LICENSES directory, i.e. the various tags in the license and exception files, and then scans either input from stdin, which it treats as a single file, or, if started without arguments, the full kernel tree. It checks whether the license expression syntax is correct and also validates whether the license identifiers used in the expressions are available in the LICENSES files. scripts/spdxcheck.py -h usage: spdxcheck.py [-h] [-m MAXLINES] [-v] [path [path ...]] SPDX expression checker positional arguments: path Check path or file. If not given full git tree scan. For stdin use "-" optional arguments: -h, --help show this help message and exit -m MAXLINES, --maxlines MAXLINES Maximum number of lines to scan in a file. Default 15 -v, --verbose Verbose statistics output include/dt-bindings/reset/amlogic,meson-axg-reset.h: 9:41 Invalid License ID: BSD drivers/pinctrl/sh-pfc/pfc-r8a77965.c: 1:28 Invalid License ID: GPL-2. include/dt-bindings/reset/amlogic,meson-axg-reset.h: 9:41 Invalid License ID: BSD arch/x86/kernel/jailhouse.c: 1:28 Invalid License ID: GPL2.0 include/dt-bindings/reset/amlogic,meson-axg-reset.h: 9:41 Invalid License ID: BSD arch/arm/mach-s3c24xx/h1940-bluetooth.c: 1:28 Invalid License ID: GPL-1.0 arch/x86/kernel/jailhouse.c: 1:28 Invalid License ID: GPL2.0 drivers/pinctrl/sh-pfc/pfc-r8a77965.c: 1:28 Invalid License ID: GPL-2. include/dt-bindings/reset/amlogic,meson-axg-reset.h: 9:41 Invalid License ID: BSD arch/x86/include/asm/jailhouse_para.h: 1:28 Invalid License ID: GPL2.0 arch/arm/mach-s3c24xx/h1940-bluetooth.c: 1:28 Invalid License ID: GPL-1.0 arch/x86/kernel/jailhouse.c: 1:28 Invalid License ID: GPL2.0 drivers/pinctrl/sh-pfc/pfc-r8a77965.c: 1:28 Invalid License ID: GPL-2. 
include/dt-bindings/reset/amlogic,meson-axg-reset.h: 9:41 Invalid License ID: BSD arch/x86/include/asm/jailhouse_para.h: 1:28 Invalid License ID: GPL2.0 License files: 14 Exception files: 1 License IDs 19 Exception IDs 1 Files checked: 61332 Lines checked: 669181 Files with SPDX: 16169 Files with errors: 5 real 0m2.642s user 0m2.231s sys 0m0.467s That's a full tree sweep on my laptop. Note, this runs single threaded. By default it scans the first 15 lines of each file for an SPDX identifier; the deepest identifier currently found inside a top comment is at line 10. The scan is going to be faster once the identifiers are all in the first two lines as documented. The python wizards will surely know how to do that smarter and faster, but it's at least better than no tool at all. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> [jc: Fixed ironically erroneous SPDX tag and did chmod +x ] Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/spdxcheck.py284
1 files changed, 284 insertions, 0 deletions
diff --git a/scripts/spdxcheck.py b/scripts/spdxcheck.py
new file mode 100755
index 000000000000..7deaef297f52
--- /dev/null
+++ b/scripts/spdxcheck.py
@@ -0,0 +1,284 @@
1#!/usr/bin/env python
2# SPDX-License-Identifier: GPL-2.0
3# Copyright Thomas Gleixner <tglx@linutronix.de>
4
5from argparse import ArgumentParser
6from ply import lex, yacc
7import traceback
8import sys
9import git
10import re
11import os
12
class ParserException(Exception):
    """Error raised while lexing, parsing or validating a license expression.

    Attributes:
        tok: the offending token, or None when no token is available.
        txt: human readable description of the problem.
    """

    def __init__(self, tok, txt):
        # Exception.__init__ is intentionally not chained; only the two
        # attributes below are set here.
        self.tok, self.txt = tok, txt
17
class SPDXException(Exception):
    """Error raised while reading the license data files.

    Attributes:
        el: the tree element the error refers to, or None if the error is
            not tied to a particular element.
        txt: human readable description of the problem.
    """

    def __init__(self, el, txt):
        # Exception.__init__ is intentionally not chained; only the two
        # attributes below are set here.
        self.el, self.txt = el, txt
22
class SPDXdata(object):
    """Container for the license information collected by the script.

    Attributes:
        license_files: counter of license files, starts at 0.
        exception_files: counter of exception files, starts at 0.
        licenses: list of license identifiers, starts empty.
        exceptions: dict keyed by exception identifier, starts empty.
    """

    def __init__(self):
        # Counters
        self.license_files = self.exception_files = 0
        # Identifier collections; fresh objects for every instance
        self.licenses = list()
        self.exceptions = dict()
29
30# Read the spdx data from the LICENSES directory
def read_spdxdata(repo):
    """Collect license and exception identifiers from the LICENSES tree.

    Walks the "preferred", "other" and "exceptions" subdirectories of
    LICENSES and reads the tag lines at the top of every file found in the
    working directory, stopping at the "License-Text:" tag of each file.

    Args:
        repo: git repository object; the LICENSES tree is looked up in the
            commit that HEAD points to.

    Returns:
        An SPDXdata instance with the upper-cased license identifiers, the
        exception identifiers mapped to the licenses they may be combined
        with, and the number of license/exception files seen.

    Raises:
        SPDXException: on a duplicate license identifier, on an exception
            which references an unknown license, or on an exception file
            without any SPDX-Licenses entry.
    """

    # The subdirectories of LICENSES in the kernel source
    license_dirs = [ "preferred", "other", "exceptions" ]
    # Use the checked out HEAD, like the file scanning code of this script
    # does. A hard-coded 'master' head fails when no such branch exists.
    lictree = repo.head.commit.tree['LICENSES']

    spdx = SPDXdata()

    for d in license_dirs:
        for el in lictree[d].traverse():
            if not os.path.isfile(el.path):
                continue

            exception = None
            # The context manager closes the file even when an
            # SPDXException is raised half way through
            with open(el.path) as fd:
                for l in fd:
                    if l.startswith('Valid-License-Identifier:'):
                        lid = l.split(':')[1].strip().upper()
                        if lid in spdx.licenses:
                            raise SPDXException(el, 'Duplicate License Identifier: %s' %lid)
                        spdx.licenses.append(lid)

                    elif l.startswith('SPDX-Exception-Identifier:'):
                        exception = l.split(':')[1].strip().upper()
                        spdx.exceptions[exception] = []

                    elif l.startswith('SPDX-Licenses:'):
                        # Comma separated list; blanks and tabs are dropped
                        # before splitting
                        for lic in l.split(':')[1].upper().strip().replace(' ', '').replace('\t', '').split(','):
                            if lic not in spdx.licenses:
                                raise SPDXException(None, 'Exception %s missing license %s' %(exception, lic))
                            spdx.exceptions[exception].append(lic)

                    elif l.startswith("License-Text:"):
                        if exception:
                            if not len(spdx.exceptions[exception]):
                                raise SPDXException(el, 'Exception %s is missing SPDX-Licenses' %exception)
                            spdx.exception_files += 1
                        else:
                            spdx.license_files += 1
                        # Everything after this tag is license text
                        break
    return spdx
72
class id_parser(object):
    """Lexer, parser and validator for SPDX license expressions.

    The class is handed to ply's lex and yacc as the rule module: the
    t_* members are the lexer rules and the p_* methods the grammar rules.
    The string at the top of each t_* / p_* method is the rule itself
    (regular expression or grammar production), not documentation.

    Besides parsing, the instance keeps the scan statistics:
    lines_checked, checked (files), spdx_valid and spdx_errors.
    """

    reserved = [ 'AND', 'OR', 'WITH' ]
    tokens = [ 'LPAR', 'RPAR', 'ID', 'EXC' ] + reserved

    precedence = ( ('nonassoc', 'AND', 'OR'), )

    t_ignore = ' \t'

    def __init__(self, spdx):
        """Build lexer and parser.

        Args:
            spdx: object providing 'licenses' (container of upper-cased
                license IDs) and 'exceptions' (mapping of upper-cased
                exception ID to the license IDs it is valid for).
        """
        self.spdx = spdx
        self.lasttok = None
        self.lastid = None
        self.lexer = lex.lex(module = self, reflags = re.UNICODE)
        # Initialize the parser. No debug file and no parser rules stored on disk
        # The rules are small enough to be generated on the fly
        self.parser = yacc.yacc(module = self, write_tables = False, debug = False)
        self.lines_checked = 0
        self.checked = 0
        self.spdx_valid = 0
        self.spdx_errors = 0
        self.curline = 0
        # Initialized here but not updated anywhere in this class
        self.deepest = 0

    # Validate License and Exception IDs
    def validate(self, tok):
        """Check a token against the known license and exception IDs.

        The comparison is done on the upper-cased token value. A valid
        license ID is remembered in self.lastid so that an exception which
        follows 'WITH' can be checked against it.

        Raises:
            ParserException: for an unknown license ID, an unknown
                exception ID, or an exception which is not valid for the
                preceding license.
        """
        lid = tok.value.upper()
        if tok.type == 'ID':
            if lid not in self.spdx.licenses:
                raise ParserException(tok, 'Invalid License ID')
            self.lastid = lid
        elif tok.type == 'EXC':
            # 'in' instead of dict.has_key(), which Python 3 does not have
            if lid not in self.spdx.exceptions:
                raise ParserException(tok, 'Invalid Exception ID')
            if self.lastid not in self.spdx.exceptions[lid]:
                raise ParserException(tok, 'Exception not valid for license %s' %self.lastid)
            self.lastid = None
        elif tok.type != 'WITH':
            # 'WITH' keeps the remembered license, every other token drops it
            self.lastid = None

    # Lexer functions
    def t_RPAR(self, tok):
        r'\)'
        self.lasttok = tok.type
        return tok

    def t_LPAR(self, tok):
        r'\('
        self.lasttok = tok.type
        return tok

    def t_ID(self, tok):
        r'[A-Za-z.0-9\-+]+'

        # An identifier directly after an exception is rejected
        if self.lasttok == 'EXC':
            print(tok)
            raise ParserException(tok, 'Missing parentheses')

        tok.value = tok.value.strip()
        val = tok.value.upper()

        # Reserved words and exception IDs match the same regular
        # expression as license IDs; retype the token accordingly
        if val in self.reserved:
            tok.type = val
        elif self.lasttok == 'WITH':
            tok.type = 'EXC'

        self.lasttok = tok.type
        self.validate(tok)
        return tok

    def t_error(self, tok):
        raise ParserException(tok, 'Invalid token')

    def p_expr(self, p):
        '''expr : ID
                | ID WITH EXC
                | expr AND expr
                | expr OR expr
                | LPAR expr RPAR'''
        pass

    def p_error(self, p):
        # No token is handed in when the input ended prematurely
        if not p:
            raise ParserException(None, 'Unfinished license expression')
        else:
            raise ParserException(p, 'Syntax error')

    def parse(self, expr):
        """Parse one license expression string.

        Resets the per-expression lexer state first. Raises
        ParserException when the expression is rejected.
        """
        self.lasttok = None
        self.lastid = None
        self.parser.parse(expr, lexer = self.lexer)

    def parse_lines(self, fd, maxlines, fname):
        """Scan the first lines of a file for an SPDX license identifier.

        Only the first line containing "SPDX-License-Identifier:" within
        the first 'maxlines' lines is parsed. Errors are reported on stdout
        as '<fname>: <line>:<column> <message>' and counted in
        self.spdx_errors; they are not propagated to the caller.

        Args:
            fd: iterable which yields the lines of the file.
            maxlines: maximum number of lines to look at.
            fname: file name used in the error report.
        """
        self.checked += 1
        self.curline = 0
        try:
            for line in fd:
                self.curline += 1
                if self.curline > maxlines:
                    break
                self.lines_checked += 1
                if line.find("SPDX-License-Identifier:") < 0:
                    continue
                # The expression is the text after the first colon; a
                # trailing comment closure is dropped
                expr = line.split(':')[1].replace('*/', '').strip()
                self.parse(expr)
                self.spdx_valid += 1
                #
                # Should we check for more SPDX ids in the same file and
                # complain if there are any?
                #
                break

        except ParserException as pe:
            if pe.tok:
                col = line.find(expr) + pe.tok.lexpos
                tok = pe.tok.value
                sys.stdout.write('%s: %d:%d %s: %s\n' %(fname, self.curline, col, pe.txt, tok))
            else:
                # No token, hence no column: the arguments have to match
                # the three placeholders of the format string
                sys.stdout.write('%s: %d:0 %s\n' %(fname, self.curline, pe.txt))
            self.spdx_errors += 1
193
def scan_git_tree(tree):
    """Run the SPDX check on every file below a git tree object.

    Relies on the module level 'parser' and 'args' objects which are set
    up by the script's main section.

    Args:
        tree: git tree object providing traverse(); the 'path' of each
            traversed element is opened relative to the current directory.
    """
    for el in tree.traverse():
        # Exclude stuff which would make pointless noise
        # FIXME: Put this somewhere more sensible
        if el.path.startswith("LICENSES"):
            continue
        if el.path.find("license-rules.rst") >= 0:
            continue
        if el.path == 'scripts/checkpatch.pl':
            continue
        # Skips everything which is not a regular file in the work tree
        if not os.path.isfile(el.path):
            continue
        # Close each file right after the check instead of leaving that to
        # the garbage collector during a full tree scan
        with open(el.path) as fd:
            parser.parse_lines(fd, args.maxlines, el.path)
207
def scan_git_subtree(tree, path):
    """Descend from 'tree' along 'path' and scan the tree found there.

    Args:
        tree: git tree object to start from; indexed by entry name.
        path: '/' separated path; leading and trailing slashes are ignored.
    """
    components = path.strip('/').split('/')
    subtree = tree
    # Walk down one path component at a time
    for name in components:
        subtree = subtree[name]
    scan_git_tree(subtree)
212
if __name__ == '__main__':

    # Command line handling. 'args' is module level and is also read by
    # scan_git_tree().
    ap = ArgumentParser(description='SPDX expression checker')
    ap.add_argument('path', nargs='*', help='Check path or file. If not given full git tree scan. For stdin use "-"')
    ap.add_argument('-m', '--maxlines', type=int, default=15,
                    help='Maximum number of lines to scan in a file. Default 15')
    ap.add_argument('-v', '--verbose', action='store_true', help='Verbose statistics output')
    args = ap.parse_args()

    # Sanity check path arguments
    if '-' in args.path and len(args.path) > 1:
        sys.stderr.write('stdin input "-" must be the only path argument\n')
        sys.exit(1)

    try:
        # Use git to get the valid license expressions
        repo = git.Repo(os.getcwd())
        # Explicit check instead of 'assert', which is stripped when
        # python runs with -O
        if repo.bare:
            raise RuntimeError('bare git repositories are not supported')

        # Initialize SPDX data
        spdx = read_spdxdata(repo)

        # Initialize the parser. 'parser' is module level and is also used
        # by scan_git_tree().
        parser = id_parser(spdx)

    except SPDXException as se:
        if se.el:
            sys.stderr.write('%s: %s\n' %(se.el.path, se.txt))
        else:
            sys.stderr.write('%s\n' %se.txt)
        sys.exit(1)

    except Exception as ex:
        sys.stderr.write('FAIL: %s\n' %ex)
        sys.stderr.write('%s\n' %traceback.format_exc())
        sys.exit(1)

    try:
        if args.path and args.path[0] == '-':
            # Treat stdin as a single file
            parser.parse_lines(sys.stdin, args.maxlines, '-')
        else:
            if args.path:
                for p in args.path:
                    if os.path.isfile(p):
                        # Close the file as soon as it has been checked
                        with open(p) as fd:
                            parser.parse_lines(fd, args.maxlines, p)
                    elif os.path.isdir(p):
                        scan_git_subtree(repo.head.reference.commit.tree, p)
                    else:
                        sys.stderr.write('path %s does not exist\n' %p)
                        sys.exit(1)
            else:
                # Full git tree scan
                scan_git_tree(repo.head.commit.tree)

        if args.verbose:
            sys.stderr.write('\n')
            sys.stderr.write('License files:     %12d\n' %spdx.license_files)
            sys.stderr.write('Exception files:   %12d\n' %spdx.exception_files)
            sys.stderr.write('License IDs        %12d\n' %len(spdx.licenses))
            sys.stderr.write('Exception IDs      %12d\n' %len(spdx.exceptions))
            sys.stderr.write('\n')
            sys.stderr.write('Files checked:     %12d\n' %parser.checked)
            sys.stderr.write('Lines checked:     %12d\n' %parser.lines_checked)
            sys.stderr.write('Files with SPDX:   %12d\n' %parser.spdx_valid)
            sys.stderr.write('Files with errors: %12d\n' %parser.spdx_errors)

        sys.exit(0)

    # sys.exit() raises SystemExit, which is not an Exception subclass and
    # therefore passes through this handler
    except Exception as ex:
        sys.stderr.write('FAIL: %s\n' %ex)
        sys.stderr.write('%s\n' %traceback.format_exc())
        sys.exit(1)