Skip to content

Commit b1cf4aa

Browse files
committed
Add support for external licenses in scans #480
This adds `-dir` or `--additional_directories` as a command line option in license detection. This allows users to specify paths to directories of licenses and rules they'd like to use during license detection, but would not like to add to the ScanCode database of licenses. This involves adding a new option in `licensedcode/plugin_license.py`, and this option is used as a parameter in `scancode/api.py`. In this approach, the licenses and rules contained in these additional directories are combined with the existing licenses and rules in the ScanCode database to produce a single index. The code for this is found in `licensedcode/cache.py` and the helper methods for loading these licenses and rules are found in `licensedcode/models.py`. This commit also includes a unit test, found in `tests/licensedcode/test_plugin_license.py`, to verify that license detection succeeds with an additional directory. Part of the setup for the unit test and future tests involves creating a new directory in `tests/licensedcode/data` that contains sample external licenses used in the unit tests. Signed-off-by: Kevin Ji <[email protected]>
1 parent aba3112 commit b1cf4aa

File tree

12 files changed

+256
-29
lines changed

12 files changed

+256
-29
lines changed

src/licensedcode/cache.py

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
# global in-memory cache of the LicenseCache
3131
_LICENSE_CACHE = None
32+
_CACHED_DIRECTORIES = []
3233

3334
LICENSE_INDEX_LOCK_TIMEOUT = 60 * 4
3435
LICENSE_INDEX_DIR = 'license_index'
@@ -58,6 +59,7 @@ def load_or_build(
5859
timeout=LICENSE_INDEX_LOCK_TIMEOUT,
5960
licenses_data_dir=None,
6061
rules_data_dir=None,
62+
additional_directories=None,
6163
):
6264
"""
6365
Load or build and save and return a LicenseCache object.
@@ -92,6 +94,8 @@ def load_or_build(
9294
from licensedcode.models import licenses_data_dir as ldd
9395
from licensedcode.models import rules_data_dir as rdd
9496
from licensedcode.models import load_licenses
97+
from licensedcode.models import load_licenses_from_multiple_dirs
98+
from licensedcode.models import get_license_dirs
9599
from scancode import lockfile
96100

97101
licenses_data_dir = licenses_data_dir or ldd
@@ -106,13 +110,21 @@ def load_or_build(
106110
# Here, the cache is either stale or non-existing: we need to
107111
# rebuild all cached data (e.g. mostly the index) and cache it
108112

109-
licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)
113+
if additional_directories:
114+
additional_license_dirs = get_license_dirs(additional_dirs=additional_directories)
115+
combined_directories = [licenses_data_dir] + additional_license_dirs
116+
licenses_db = load_licenses_from_multiple_dirs(license_directories=combined_directories)
117+
else:
118+
licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)
110119

120+
# create a single merged index containing license data from licenses_data_dir
121+
# and data from additional directories
111122
index = build_index(
112123
licenses_db=licenses_db,
113124
licenses_data_dir=licenses_data_dir,
114125
rules_data_dir=rules_data_dir,
115126
index_all_languages=index_all_languages,
127+
additional_directories=additional_directories,
116128
)
117129

118130
spdx_symbols = build_spdx_symbols(licenses_db=licenses_db)
@@ -143,6 +155,7 @@ def build_index(
143155
licenses_data_dir=None,
144156
rules_data_dir=None,
145157
index_all_languages=False,
158+
additional_directories=None,
146159
):
147160
"""
148161
Return an index built from rules and licenses directories
@@ -151,19 +164,35 @@ def build_index(
151164
Otherwise, only include the English license texts and rules (the default)
152165
"""
153166
from licensedcode.index import LicenseIndex
167+
from licensedcode.models import get_license_dirs
168+
from licensedcode.models import get_rule_dirs
154169
from licensedcode.models import get_rules
170+
from licensedcode.models import get_rules_from_multiple_dirs
155171
from licensedcode.models import get_all_spdx_key_tokens
156172
from licensedcode.models import get_license_tokens
157173
from licensedcode.models import licenses_data_dir as ldd
158174
from licensedcode.models import rules_data_dir as rdd
159175
from licensedcode.models import load_licenses
176+
from licensedcode.models import load_licenses_from_multiple_dirs
160177
from licensedcode.legalese import common_license_words
161178

162179
licenses_data_dir = licenses_data_dir or ldd
163180
rules_data_dir = rules_data_dir or rdd
164181

165-
licenses_db = licenses_db or load_licenses(licenses_data_dir=licenses_data_dir)
166-
rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)
182+
if not licenses_db:
183+
if additional_directories:
184+
additional_license_dirs = get_license_dirs(additional_dirs=additional_directories)
185+
combined_license_directories = [licenses_data_dir] + additional_license_dirs
186+
licenses_db = load_licenses_from_multiple_dirs(license_dirs=combined_license_directories)
187+
else:
188+
licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)
189+
190+
if additional_directories:
191+
additional_rule_dirs = get_rule_dirs(additional_dirs=additional_directories)
192+
combined_rule_directories = [rules_data_dir] + additional_rule_dirs
193+
rules = get_rules_from_multiple_dirs(licenses_db=licenses_db, rule_directories=combined_rule_directories)
194+
else:
195+
rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)
167196

168197
legalese = common_license_words
169198
spdx_tokens = set(get_all_spdx_key_tokens(licenses_db))
@@ -299,33 +328,44 @@ def build_unknown_spdx_symbol(licenses_db=None):
299328
return LicenseSymbolLike(licenses_db['unknown-spdx'])
300329

301330

302-
def get_cache(force=False, index_all_languages=False):
331+
def get_cache(force=False, index_all_languages=False, additional_directories=None):
    """
    Return the in-memory LicenseCache, after populate_cache() has had the
    chance to rebuild it, load it from disk or keep the one already cached.

    When ``index_all_languages`` is True, texts in all languages are included
    when building the license index. By default only the English license
    texts and rules are included.

    ``additional_directories`` is forwarded unchanged to populate_cache().
    """
    populate_cache(
        force=force,
        index_all_languages=index_all_languages,
        additional_directories=additional_directories,
    )
    # populate_cache() rebinds the module-level cache; read it afterwards
    return _LICENSE_CACHE
313342

314343

315-
def populate_cache(force=False, index_all_languages=False):
344+
def populate_cache(force=False, index_all_languages=False, additional_directories=None):
    """
    Load or build and cache a LicenseCache. Return None.

    ``additional_directories`` is an optional list of paths to directories
    with extra licenses and rules. When it is provided and differs (ignoring
    order) from the directories recorded for the current in-memory cache, a
    rebuild is forced. When it is None, an existing in-memory cache is reused
    as-is unless ``force`` is True.
    """
    global _LICENSE_CACHE
    global _CACHED_DIRECTORIES

    if additional_directories is not None:
        # A cache built for a different set of directories must not be
        # reused: force a rebuild in that case.
        if sorted(additional_directories) != sorted(_CACHED_DIRECTORIES):
            force = True

    if force or not _LICENSE_CACHE:
        _LICENSE_CACHE = LicenseCache.load_or_build(
            licensedcode_cache_dir=licensedcode_cache_dir,
            scancode_cache_dir=scancode_cache_dir,
            force=force,
            index_all_languages=index_all_languages,
            # used for testing only
            timeout=LICENSE_INDEX_LOCK_TIMEOUT,
            additional_directories=additional_directories,
        )
        # Always record what this cache was built with, including "nothing".
        # Otherwise a later call with an empty selection would compare
        # against a stale record and force a rebuild on every call. Store a
        # copy so later mutation of the caller's list cannot alter the record.
        _CACHED_DIRECTORIES = list(additional_directories or ())
329369

330370

331371
def load_cache_file(cache_file):
@@ -346,11 +386,15 @@ def load_cache_file(cache_file):
346386
raise Exception(msg) from e
347387

348388

349-
def get_index(force=False, index_all_languages=False):
389+
def get_index(force=False, index_all_languages=False, additional_directories=None):
    """
    Return the LicenseIndex held by the LicenseCache, building and caching it
    first when needed. All arguments are forwarded unchanged to get_cache().
    """
    license_cache = get_cache(
        force=force,
        index_all_languages=index_all_languages,
        additional_directories=additional_directories,
    )
    return license_cache.index
354398

355399

356400
get_cached_index = get_index

src/licensedcode/models.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -768,6 +768,64 @@ def get_rules(
768768
licenses_as_rules = build_rules_from_licenses(licenses_db)
769769
return chain(licenses_as_rules, rules)
770770

771+
def get_license_dirs(
    additional_dirs,
):
    """
    Return a list of paths to the ``licenses`` subdirectory of each directory
    in the ``additional_dirs`` list of additional directories specified during
    license detection.

    The paths are only computed: there is no check that they exist.
    Return an empty list if ``additional_dirs`` is empty or None.
    """
    from os.path import join

    # join() rather than string formatting so that a directory given with a
    # trailing separator does not produce a doubled separator.
    return [join(path, 'licenses') for path in additional_dirs or ()]
779+
780+
def get_rule_dirs(
    additional_dirs,
):
    """
    Return a list of paths to the ``rules`` subdirectory of each directory in
    the ``additional_dirs`` list of additional directories specified during
    license detection.

    The paths are only computed: there is no check that they exist.
    Return an empty list if ``additional_dirs`` is empty or None.
    """
    from os.path import join

    # join() rather than string formatting so that a directory given with a
    # trailing separator does not produce a doubled separator.
    return [join(path, 'rules') for path in additional_dirs or ()]
788+
789+
def load_licenses_from_multiple_dirs(
    license_directories,
    with_deprecated=False,
):
    """
    Return a single mapping of key -> License combining the licenses loaded
    from each directory of the ``license_directories`` list.

    Directories are loaded in list order. When the same license key is present
    in more than one directory, the License from the later directory replaces
    the one loaded earlier.

    ``with_deprecated`` is passed through to load_licenses() for each
    directory. Return an empty mapping if ``license_directories`` is empty or
    None.
    """
    combined_licenses = {}
    for license_dir in license_directories or ():
        # update in place: later directories win on duplicate keys, without
        # rebuilding the whole mapping for each directory
        combined_licenses.update(
            load_licenses(
                licenses_data_dir=license_dir,
                with_deprecated=with_deprecated,
            )
        )
    return combined_licenses
803+
804+
def get_rules_from_multiple_dirs(
    licenses_db,
    rule_directories,
):
    """
    Return an iterable of rules made of the rules built from the
    ``licenses_db`` mapping of key -> License objects, followed by the rules
    loaded from each directory of the ``rule_directories`` list. The rules
    loaded from directories are validated against ``licenses_db``.

    If ``rule_directories`` is empty or None, return the result of get_rules()
    for the module-level ``rules_data_dir`` instead.
    """
    if not rule_directories:
        # get_rules() already returns the license-derived rules chained with
        # the directory rules: return its result as-is rather than chaining
        # the license-derived rules a second time.
        return get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)

    # Collect into a real list rather than a one-shot iterator, so that the
    # rules are still available to return after validate_rules() has gone
    # through them.
    rules = []
    for rules_dir in rule_directories:
        rules.extend(load_rules(rules_data_dir=rules_dir))

    validate_rules(rules, licenses_db)
    licenses_as_rules = build_rules_from_licenses(licenses_db)
    return chain(licenses_as_rules, rules)
828+
771829

772830
class InvalidRule(Exception):
773831
pass

src/licensedcode/plugin_license.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from commoncode.resource import clean_path
1919
from plugincode.scan import ScanPlugin
2020
from plugincode.scan import scan_impl
21+
import click
2122

2223
from scancode.api import SCANCODE_LICENSEDB_URL
2324

@@ -139,6 +140,14 @@ class LicenseScanner(ScanPlugin):
139140
help_group=SCAN_OPTIONS_GROUP,
140141
),
141142

143+
PluggableCommandLineOption(
144+
('-dir', '--additional_directories'),
145+
multiple=True,
146+
type=click.Path(exists=True, readable=True, path_type=str),
147+
help='Include additional directories for license detection.',
148+
help_group=SCAN_OPTIONS_GROUP,
149+
),
150+
142151
PluggableCommandLineOption(
143152
('--reindex-licenses',),
144153
is_flag=True, is_eager=True,
@@ -167,7 +176,8 @@ def setup(self, **kwargs):
167176
loaded index.
168177
"""
169178
from licensedcode.cache import populate_cache
170-
populate_cache()
179+
additional_directories = kwargs.get('additional_directories')
180+
populate_cache(additional_directories=additional_directories)
171181

172182
def get_scanner(
173183
self,
@@ -176,6 +186,7 @@ def get_scanner(
176186
license_text_diagnostics=False,
177187
license_url_template=SCANCODE_LICENSEDB_URL,
178188
unknown_licenses=False,
189+
additional_directories=None,
179190
**kwargs
180191
):
181192

@@ -186,6 +197,7 @@ def get_scanner(
186197
license_text_diagnostics=license_text_diagnostics,
187198
license_url_template=license_url_template,
188199
unknown_licenses=unknown_licenses,
200+
additional_directories=additional_directories,
189201
)
190202

191203
def process_codebase(self, codebase, unknown_licenses, **kwargs):

src/scancode/api.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ def get_licenses(
142142
license_url_template=SCANCODE_LICENSEDB_URL,
143143
unknown_licenses=False,
144144
deadline=sys.maxsize,
145+
additional_directories=None,
145146
**kwargs,
146147
):
147148
"""
@@ -168,7 +169,7 @@ def get_licenses(
168169
from licensedcode import cache
169170
from licensedcode.spans import Span
170171

171-
idx = cache.get_index()
172+
idx = cache.get_index(additional_directories=additional_directories)
172173

173174
detected_licenses = []
174175
detected_expressions = []
@@ -252,6 +253,7 @@ def _licenses_data_from_match(
252253
result['homepage_url'] = lic.homepage_url
253254
result['text_url'] = lic.text_urls[0] if lic.text_urls else ''
254255
result['reference_url'] = license_url_template.format(lic.key)
256+
# TODO: change this in the case of a private license?
255257
result['scancode_text_url'] = SCANCODE_LICENSE_TEXT_URL.format(lic.key)
256258
result['scancode_data_url'] = SCANCODE_LICENSE_DATA_URL.format(lic.key)
257259

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
The quick brown fox jumps over the lazy dog.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
key: example1
2+
short_name: Example External License 1
3+
name: Example External License 1
4+
category: Permissive
5+
owner: NexB
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
The quick brown fox jumps over the lazy dog.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
license_expression: example1
2+
is_license_text: yes
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
{
2+
"headers": [
3+
{
4+
"tool_name": "scancode-toolkit",
5+
"options": {
6+
"input": "<path>",
7+
"-dir": "<path>",
8+
"--json": "<file>",
9+
"--license": true,
10+
"--strip-root": true
11+
},
12+
"notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.",
13+
"output_format_version": "2.0.0",
14+
"message": null,
15+
"errors": [],
16+
"warnings": [],
17+
"extra_data": {
18+
"system_environment": {
19+
"operating_system": "linux",
20+
"cpu_architecture": "64",
21+
"platform": "Linux-5.4.0-109-generic-x86_64-with-Ubuntu-18.04-bionic",
22+
"platform_version": "#123~18.04.1-Ubuntu SMP Fri Apr 8 09:48:52 UTC 2022",
23+
"python_version": "3.6.9 (default, Mar 15 2022, 13:55:28) \n[GCC 8.4.0]"
24+
},
25+
"spdx_license_list_version": "3.16",
26+
"files_count": 2
27+
}
28+
}
29+
],
30+
"files": [
31+
{
32+
"path": "license.txt",
33+
"type": "file",
34+
"licenses": [
35+
{
36+
"key": "example1",
37+
"score": 100.0,
38+
"name": "Example External License 1",
39+
"short_name": "Example External License 1",
40+
"category": "Permissive",
41+
"is_exception": false,
42+
"is_unknown": false,
43+
"owner": "NexB",
44+
"homepage_url": "",
45+
"text_url": "",
46+
"reference_url": "https://scancode-licensedb.aboutcode.org/example1",
47+
"scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/example1.LICENSE",
48+
"scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/example1.yml",
49+
"spdx_license_key": "",
50+
"spdx_url": "",
51+
"start_line": 1,
52+
"end_line": 1,
53+
"matched_rule": {
54+
"identifier": "example1.LICENSE",
55+
"license_expression": "example1",
56+
"licenses": [
57+
"example1"
58+
],
59+
"referenced_filenames": [],
60+
"is_license_text": true,
61+
"is_license_notice": false,
62+
"is_license_reference": false,
63+
"is_license_tag": false,
64+
"is_license_intro": false,
65+
"has_unknown": false,
66+
"matcher": "1-hash",
67+
"rule_length": 9,
68+
"matched_length": 9,
69+
"match_coverage": 100.0,
70+
"rule_relevance": 100
71+
}
72+
}
73+
],
74+
"license_expressions": [
75+
"example1"
76+
],
77+
"percentage_of_license_text": 100.0,
78+
"scan_errors": []
79+
}
80+
]
81+
}

0 commit comments

Comments
 (0)