Skip to content

Commit 7696b97

Browse files
committed
Add support for external licenses in scans #480
This adds `-dir` or `--additional_directories` as a command line option in license detection. This allows users to specify paths to directories of licenses and rules they would like to use during license detection, but would not like to add to the ScanCode database of licenses. This involves adding a new option in `licensedcode/plugin_license.py`, and this option is used as a parameter in `scancode/api.py`. In this approach, the licenses and rules contained in these additional directories are combined with the existing licenses and rules in the ScanCode database to produce a single index. The code for this is found in `licensedcode/cache.py`, and the helper functions for loading these licenses and rules are found in `licensedcode/models.py`. This commit also includes a unit test, found in `tests/licensedcode/test_plugin_license.py`, to verify that license detection succeeds with an additional directory. Part of the setup for this unit test and for future tests involves creating a new directory in `tests/licensedcode/data` that contains sample external licenses used in the unit tests. Signed-off-by: Kevin Ji <[email protected]>
1 parent aba3112 commit 7696b97

File tree

17 files changed

+425
-29
lines changed

17 files changed

+425
-29
lines changed

src/licensedcode/cache.py

Lines changed: 61 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929

3030
# global in-memory cache of the LicenseCache
3131
_LICENSE_CACHE = None
32+
# list of all additional directories included in the license cache
33+
_CACHED_DIRECTORIES = []
3234

3335
LICENSE_INDEX_LOCK_TIMEOUT = 60 * 4
3436
LICENSE_INDEX_DIR = 'license_index'
@@ -58,6 +60,7 @@ def load_or_build(
5860
timeout=LICENSE_INDEX_LOCK_TIMEOUT,
5961
licenses_data_dir=None,
6062
rules_data_dir=None,
63+
additional_directories=None,
6164
):
6265
"""
6366
Load or build and save and return a LicenseCache object.
@@ -92,6 +95,8 @@ def load_or_build(
9295
from licensedcode.models import licenses_data_dir as ldd
9396
from licensedcode.models import rules_data_dir as rdd
9497
from licensedcode.models import load_licenses
98+
from licensedcode.models import load_licenses_from_multiple_dirs
99+
from licensedcode.models import get_license_dirs
95100
from scancode import lockfile
96101

97102
licenses_data_dir = licenses_data_dir or ldd
@@ -106,13 +111,21 @@ def load_or_build(
106111
# Here, the cache is either stale or non-existing: we need to
107112
# rebuild all cached data (e.g. mostly the index) and cache it
108113

109-
licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)
114+
if additional_directories:
115+
additional_license_dirs = get_license_dirs(additional_dirs=additional_directories)
116+
combined_directories = [licenses_data_dir] + additional_license_dirs
117+
licenses_db = load_licenses_from_multiple_dirs(license_directories=combined_directories)
118+
else:
119+
licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)
110120

121+
# create a single merged index containing license data from licenses_data_dir
122+
# and data from additional directories
111123
index = build_index(
112124
licenses_db=licenses_db,
113125
licenses_data_dir=licenses_data_dir,
114126
rules_data_dir=rules_data_dir,
115127
index_all_languages=index_all_languages,
128+
additional_directories=additional_directories,
116129
)
117130

118131
spdx_symbols = build_spdx_symbols(licenses_db=licenses_db)
@@ -143,27 +156,50 @@ def build_index(
143156
licenses_data_dir=None,
144157
rules_data_dir=None,
145158
index_all_languages=False,
159+
additional_directories=None,
146160
):
147161
"""
148162
Return an index built from rules and licenses directories
149163
150164
If ``index_all_languages`` is True, include texts and rules in all languages.
151165
Otherwise, only include the English license texts and rules (the default)
166+
If ``additional_directories`` is not None, we will include licenses and rules
167+
from these additional directories in the returned index.
152168
"""
153169
from licensedcode.index import LicenseIndex
170+
from licensedcode.models import get_license_dirs
171+
from licensedcode.models import get_rule_dirs
154172
from licensedcode.models import get_rules
173+
from licensedcode.models import get_rules_from_multiple_dirs
155174
from licensedcode.models import get_all_spdx_key_tokens
156175
from licensedcode.models import get_license_tokens
157176
from licensedcode.models import licenses_data_dir as ldd
158177
from licensedcode.models import rules_data_dir as rdd
159178
from licensedcode.models import load_licenses
179+
from licensedcode.models import load_licenses_from_multiple_dirs
160180
from licensedcode.legalese import common_license_words
161181

162182
licenses_data_dir = licenses_data_dir or ldd
163183
rules_data_dir = rules_data_dir or rdd
164184

165-
licenses_db = licenses_db or load_licenses(licenses_data_dir=licenses_data_dir)
166-
rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)
185+
if not licenses_db:
186+
if additional_directories:
187+
# combine the licenses in these additional directories with the licenses in the original DB
188+
additional_license_dirs = get_license_dirs(additional_dirs=additional_directories)
189+
combined_license_directories = [licenses_data_dir] + additional_license_dirs
190+
# generate a single combined license db with all licenses
191+
licenses_db = load_licenses_from_multiple_dirs(license_dirs=combined_license_directories)
192+
else:
193+
licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)
194+
195+
if additional_directories:
196+
# if we have additional directories, extract the rules from them
197+
additional_rule_dirs = get_rule_dirs(additional_dirs=additional_directories)
198+
# then combine the rules in these additional directories with the rules in the original rules directory
199+
combined_rule_directories = [rules_data_dir] + additional_rule_dirs
200+
rules = get_rules_from_multiple_dirs(licenses_db=licenses_db, rule_directories=combined_rule_directories)
201+
else:
202+
rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)
167203

168204
legalese = common_license_words
169205
spdx_tokens = set(get_all_spdx_key_tokens(licenses_db))
@@ -299,33 +335,45 @@ def build_unknown_spdx_symbol(licenses_db=None):
299335
return LicenseSymbolLike(licenses_db['unknown-spdx'])
300336

301337

302-
def get_cache(force=False, index_all_languages=False):
338+
def get_cache(force=False, index_all_languages=False, additional_directories=None):
    """
    Return the module-level LicenseCache after making sure it is populated.

    All arguments are passed as-is to ``populate_cache()``:

    - ``force``: request a rebuild of the cache.
    - ``index_all_languages``: if True, include texts in all languages when
      building the license index. Otherwise, only include the English license
      texts and rules (the default).
    - ``additional_directories``: optional list of paths to extra directories
      of licenses and rules to include in the cache.
    """
    populate_cache(
        force=force,
        index_all_languages=index_all_languages,
        additional_directories=additional_directories,
    )
    # populate_cache() assigns the module global; it is only read here
    return _LICENSE_CACHE
313349

314350

315-
def populate_cache(force=False, index_all_languages=False):
351+
def populate_cache(force=False, index_all_languages=False, additional_directories=None):
    """
    Load or build and cache a LicenseCache. Return None.

    ``additional_directories`` is an optional sequence of paths to extra
    directories of licenses and rules. When it is not None and differs
    (ignoring order) from the directories used for the LicenseCache currently
    held in memory, a rebuild is forced. When it is None, an existing
    in-memory LicenseCache is reused as-is unless ``force`` is True.
    """
    global _LICENSE_CACHE
    global _CACHED_DIRECTORIES

    # TODO: document that the additional directories must be passed on every
    # run where they are needed: a new process starts with an empty in-memory
    # cache and an empty list of cached directories.
    directories_changed = (
        additional_directories is not None
        and sorted(additional_directories) != sorted(_CACHED_DIRECTORIES)
    )
    if directories_changed:
        # without forcing, load_or_build() may return a previously saved cache
        # that does not include these directories
        force = True

    if force or not _LICENSE_CACHE:
        _LICENSE_CACHE = LicenseCache.load_or_build(
            licensedcode_cache_dir=licensedcode_cache_dir,
            scancode_cache_dir=scancode_cache_dir,
            force=force,
            index_all_languages=index_all_languages,
            # used for testing only
            timeout=LICENSE_INDEX_LOCK_TIMEOUT,
            additional_directories=additional_directories,
        )
        # Always record what the cache was just built with, including "no
        # additional directories": otherwise a stale list would make later
        # comparisons wrong (skipped or repeated rebuilds). Store a copy so
        # that later mutations by the caller do not alter the recorded state.
        _CACHED_DIRECTORIES = list(additional_directories or [])
329377

330378

331379
def load_cache_file(cache_file):
@@ -346,11 +394,15 @@ def load_cache_file(cache_file):
346394
raise Exception(msg) from e
347395

348396

349-
def get_index(force=False, index_all_languages=False):
397+
def get_index(force=False, index_all_languages=False, additional_directories=None):
    """
    Return the LicenseIndex held by the LicenseCache obtained from
    ``get_cache()``, which builds and caches it as needed. All arguments are
    passed as-is to ``get_cache()``.
    """
    license_cache = get_cache(
        force=force,
        index_all_languages=index_all_languages,
        additional_directories=additional_directories,
    )
    return license_cache.index
354406

355407

356408
get_cached_index = get_index

src/licensedcode/models.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from os.path import dirname
2121
from os.path import exists
2222
from os.path import join
23+
from pathlib import Path
2324

2425
import attr
2526
import saneyaml
@@ -768,6 +769,64 @@ def get_rules(
768769
licenses_as_rules = build_rules_from_licenses(licenses_db)
769770
return chain(licenses_as_rules, rules)
770771

772+
def get_license_dirs(
    additional_dirs,
):
    """
    Return a list of absolute path strings, one for the ``licenses``
    subdirectory of each directory path of the ``additional_dirs`` list
    specified during license detection.

    Return an empty list if ``additional_dirs`` is None or empty.
    The returned paths are not checked for existence.
    """
    # Convert to an absolute path in case the user passes in a relative path,
    # which messes up building rules from licenses. Join with pathlib rather
    # than a hard-coded "/" so that the native path separator is used.
    return [
        str(Path(path).absolute() / 'licenses')
        for path in additional_dirs or []
    ]
781+
782+
def get_rule_dirs(
    additional_dirs,
):
    """
    Return a list of absolute path strings, one for the ``rules``
    subdirectory of each directory path of the ``additional_dirs`` list
    specified during license detection.

    Return an empty list if ``additional_dirs`` is None or empty.
    The returned paths are not checked for existence.
    """
    # Relative paths are made absolute. Join with pathlib rather than a
    # hard-coded "/" so that the native path separator is used.
    return [
        str(Path(path).absolute() / 'rules')
        for path in additional_dirs or []
    ]
790+
791+
def load_licenses_from_multiple_dirs(
    license_directories,
    with_deprecated=False,
):
    """
    Return a single mapping combining the license mappings loaded with
    ``load_licenses()`` from each directory of the ``license_directories``
    list of paths.

    Directories are loaded in list order: when the same key is present in
    more than one directory, the entry from the later directory replaces the
    earlier one.

    ``with_deprecated`` is passed as-is to ``load_licenses()`` for every
    directory.
    """
    combined_licenses = {}
    for license_dir in license_directories:
        licenses = load_licenses(
            licenses_data_dir=license_dir,
            # honor the caller's choice rather than a hard-coded False
            with_deprecated=with_deprecated,
        )
        # merge in place: later directories override earlier ones on the same key
        combined_licenses.update(licenses)
    return combined_licenses
805+
806+
def get_rules_from_multiple_dirs(
    licenses_db,
    rule_directories,
):
    """
    Return an iterable of rules built from the licenses of the
    ``licenses_db`` mapping of key->License objects, followed by the rules
    loaded from each directory of the ``rule_directories`` list of paths.
    The loaded rules are validated against ``licenses_db``.

    If ``rule_directories`` is empty, return the result of ``get_rules()``
    called with the module-level default ``rules_data_dir`` instead.
    """
    if not rule_directories:
        return get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)

    # load the rules of every directory, in order, into a single flat list
    rules = [
        rule
        for rules_dir in rule_directories
        for rule in load_rules(rules_data_dir=rules_dir)
    ]
    validate_rules(rules, licenses_db)
    licenses_as_rules = build_rules_from_licenses(licenses_db)
    return chain(licenses_as_rules, rules)
829+
771830

772831
class InvalidRule(Exception):
773832
pass

src/licensedcode/plugin_license.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from commoncode.resource import clean_path
1919
from plugincode.scan import ScanPlugin
2020
from plugincode.scan import scan_impl
21+
import click
2122

2223
from scancode.api import SCANCODE_LICENSEDB_URL
2324

@@ -139,6 +140,15 @@ class LicenseScanner(ScanPlugin):
139140
help_group=SCAN_OPTIONS_GROUP,
140141
),
141142

143+
PluggableCommandLineOption(
144+
('-dir', '--additional_directories'),
145+
required_options=['license'],
146+
multiple=True,
147+
type=click.Path(exists=True, readable=True, path_type=str),
148+
help='Include additional directories for license detection.',
149+
help_group=SCAN_OPTIONS_GROUP,
150+
),
151+
142152
PluggableCommandLineOption(
143153
('--reindex-licenses',),
144154
is_flag=True, is_eager=True,
@@ -167,7 +177,8 @@ def setup(self, **kwargs):
167177
loaded index.
168178
"""
169179
from licensedcode.cache import populate_cache
170-
populate_cache()
180+
additional_directories = kwargs.get('additional_directories')
181+
populate_cache(additional_directories=additional_directories)
171182

172183
def get_scanner(
173184
self,
@@ -176,6 +187,7 @@ def get_scanner(
176187
license_text_diagnostics=False,
177188
license_url_template=SCANCODE_LICENSEDB_URL,
178189
unknown_licenses=False,
190+
additional_directories=None,
179191
**kwargs
180192
):
181193

@@ -186,6 +198,7 @@ def get_scanner(
186198
license_text_diagnostics=license_text_diagnostics,
187199
license_url_template=license_url_template,
188200
unknown_licenses=unknown_licenses,
201+
additional_directories=additional_directories,
189202
)
190203

191204
def process_codebase(self, codebase, unknown_licenses, **kwargs):

src/scancode/api.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ def get_licenses(
142142
license_url_template=SCANCODE_LICENSEDB_URL,
143143
unknown_licenses=False,
144144
deadline=sys.maxsize,
145+
additional_directories=None,
145146
**kwargs,
146147
):
147148
"""
@@ -168,7 +169,7 @@ def get_licenses(
168169
from licensedcode import cache
169170
from licensedcode.spans import Span
170171

171-
idx = cache.get_index()
172+
idx = cache.get_index(additional_directories=additional_directories)
172173

173174
detected_licenses = []
174175
detected_expressions = []
@@ -252,6 +253,7 @@ def _licenses_data_from_match(
252253
result['homepage_url'] = lic.homepage_url
253254
result['text_url'] = lic.text_urls[0] if lic.text_urls else ''
254255
result['reference_url'] = license_url_template.format(lic.key)
256+
# TODO: change this in the case of a private license?
255257
result['scancode_text_url'] = SCANCODE_LICENSE_TEXT_URL.format(lic.key)
256258
result['scancode_data_url'] = SCANCODE_LICENSE_DATA_URL.format(lic.key)
257259

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
The quick brown fox jumps over the lazy dog.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
key: example1
2+
short_name: Example External License 1
3+
name: Example External License 1
4+
category: Permissive
5+
owner: NexB
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
The quick brown fox jumps over the lazy dog.
2+
The quick brown fox jumps over the lazy dog.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
license_expression: example1
2+
is_license_text: yes
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Lorem ipsum dolor sit amet, consectetur adipiscing elit,
2+
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
3+
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi
4+
ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
5+
in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
6+
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia
7+
deserunt mollit anim id est laborum.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
key: example2
2+
short_name: Example External License 2
3+
name: Example External License 2
4+
category: Permissive
5+
owner: NexB
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Lorem ipsum dolor sit amet, consectetur adipiscing elit
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
license_expression: example2
2+
is_license_text: yes

0 commit comments

Comments
 (0)