Skip to content

Commit 93e90a4

Browse files
committed
Add support for external licenses in scans #480
This adds `-dir` or `--additional_directories` as a command line option in license detection. This allows users to specify paths to directories of licenses and rules they'd like to use during license detection, but would not like to add to the ScanCode database of licenses. This involves adding a new option in `licensedcode/plugin_license.py`, and this option is used as a parameter in `scancode/api.py`. In this approach, the licenses and rules contained in these additional directories are combined with the existing licenses and rules in the ScanCode database to produce a single index. The code for this is found in `licensedcode/cache.py` and the helper methods for loading these licenses and rules are found in `licensedcode/models.py`. This commit also includes a unit test to verify that license detection succeeds with an additional directory found in `tests/licensedcode/test_plugin_license.py`. Part of the setup for the unit test and future tests involves creating a new directory in `tests/licensedcode/data` that contains sample external licenses used in the unit tests. Signed-off-by: Kevin Ji <[email protected]>
1 parent aba3112 commit 93e90a4

File tree

17 files changed

+419
-29
lines changed

17 files changed

+419
-29
lines changed

src/licensedcode/cache.py

Lines changed: 58 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929

3030
# global in-memory cache of the LicenseCache
3131
_LICENSE_CACHE = None
32+
# list of all additional directories included in the license cache
33+
_CACHED_DIRECTORIES = []
3234

3335
LICENSE_INDEX_LOCK_TIMEOUT = 60 * 4
3436
LICENSE_INDEX_DIR = 'license_index'
@@ -58,6 +60,7 @@ def load_or_build(
5860
timeout=LICENSE_INDEX_LOCK_TIMEOUT,
5961
licenses_data_dir=None,
6062
rules_data_dir=None,
63+
additional_directories=None,
6164
):
6265
"""
6366
Load or build and save and return a LicenseCache object.
@@ -92,6 +95,8 @@ def load_or_build(
9295
from licensedcode.models import licenses_data_dir as ldd
9396
from licensedcode.models import rules_data_dir as rdd
9497
from licensedcode.models import load_licenses
98+
from licensedcode.models import load_licenses_from_multiple_dirs
99+
from licensedcode.models import get_license_dirs
95100
from scancode import lockfile
96101

97102
licenses_data_dir = licenses_data_dir or ldd
@@ -106,13 +111,21 @@ def load_or_build(
106111
# Here, the cache is either stale or non-existing: we need to
107112
# rebuild all cached data (e.g. mostly the index) and cache it
108113

109-
licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)
114+
if additional_directories:
115+
additional_license_dirs = get_license_dirs(additional_dirs=additional_directories)
116+
combined_directories = [licenses_data_dir] + additional_license_dirs
117+
licenses_db = load_licenses_from_multiple_dirs(license_directories=combined_directories)
118+
else:
119+
licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)
110120

121+
# create a single merged index containing license data from licenses_data_dir
122+
# and data from additional directories
111123
index = build_index(
112124
licenses_db=licenses_db,
113125
licenses_data_dir=licenses_data_dir,
114126
rules_data_dir=rules_data_dir,
115127
index_all_languages=index_all_languages,
128+
additional_directories=additional_directories,
116129
)
117130

118131
spdx_symbols = build_spdx_symbols(licenses_db=licenses_db)
@@ -143,27 +156,50 @@ def build_index(
143156
licenses_data_dir=None,
144157
rules_data_dir=None,
145158
index_all_languages=False,
159+
additional_directories=None,
146160
):
147161
"""
148162
Return an index built from rules and licenses directories
149163
150164
If ``index_all_languages`` is True, include texts and rules in all languages.
151165
Otherwise, only include the English license texts and rules (the default)
166+
If ``additional_directories`` is not None, we will include licenses and rules
167+
from these additional directories in the returned index.
152168
"""
153169
from licensedcode.index import LicenseIndex
170+
from licensedcode.models import get_license_dirs
171+
from licensedcode.models import get_rule_dirs
154172
from licensedcode.models import get_rules
173+
from licensedcode.models import get_rules_from_multiple_dirs
155174
from licensedcode.models import get_all_spdx_key_tokens
156175
from licensedcode.models import get_license_tokens
157176
from licensedcode.models import licenses_data_dir as ldd
158177
from licensedcode.models import rules_data_dir as rdd
159178
from licensedcode.models import load_licenses
179+
from licensedcode.models import load_licenses_from_multiple_dirs
160180
from licensedcode.legalese import common_license_words
161181

162182
licenses_data_dir = licenses_data_dir or ldd
163183
rules_data_dir = rules_data_dir or rdd
164184

165-
licenses_db = licenses_db or load_licenses(licenses_data_dir=licenses_data_dir)
166-
rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)
185+
if not licenses_db:
186+
if additional_directories:
187+
# combine the licenses in these additional directories with the licenses in the original DB
188+
additional_license_dirs = get_license_dirs(additional_dirs=additional_directories)
189+
combined_license_directories = [licenses_data_dir] + additional_license_dirs
190+
# generate a single combined license db with all licenses
191+
licenses_db = load_licenses_from_multiple_dirs(license_dirs=combined_license_directories)
192+
else:
193+
licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)
194+
195+
if additional_directories:
196+
# if we have additional directories, extract the rules from them
197+
additional_rule_dirs = get_rule_dirs(additional_dirs=additional_directories)
198+
# then combine the rules in these additional directories with the rules in the original rules directory
199+
combined_rule_directories = [rules_data_dir] + additional_rule_dirs
200+
rules = get_rules_from_multiple_dirs(licenses_db=licenses_db, rule_directories=combined_rule_directories)
201+
else:
202+
rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)
167203

168204
legalese = common_license_words
169205
spdx_tokens = set(get_all_spdx_key_tokens(licenses_db))
@@ -299,33 +335,42 @@ def build_unknown_spdx_symbol(licenses_db=None):
299335
return LicenseSymbolLike(licenses_db['unknown-spdx'])
300336

301337

302-
def get_cache(force=False, index_all_languages=False, additional_directories=None):
    """
    Return the global LicenseCache after making sure it is populated: it is
    either rebuilt, reused from memory or loaded from disk.

    If ``index_all_languages`` is True, include texts in all languages when
    building the license index. Otherwise, only include the English license
    texts and rules (the default).

    ``force`` and ``additional_directories`` are handed unchanged to
    populate_cache().
    """
    global _LICENSE_CACHE
    populate_cache(
        force=force,
        index_all_languages=index_all_languages,
        additional_directories=additional_directories,
    )
    return _LICENSE_CACHE
313349

314350

315-
def populate_cache(force=False, index_all_languages=False, additional_directories=None):
    """
    Load or build and cache a LicenseCache. Return None.

    The global cache is (re)loaded when any of these holds:
    - ``force`` is True,
    - no LicenseCache is held in memory yet,
    - ``additional_directories`` is not None and differs (ignoring order) from
      the directories recorded for the LicenseCache currently in memory.

    ``additional_directories`` is a sequence of directory paths passed through
    to LicenseCache.load_or_build(). None means "no opinion": an existing
    in-memory cache is reused as-is.
    """
    global _LICENSE_CACHE
    global _CACHED_DIRECTORIES

    directories_changed = (
        additional_directories is not None
        and sorted(additional_directories) != sorted(_CACHED_DIRECTORIES)
    )
    if directories_changed:
        # Without force, load_or_build() may hand back previously cached data
        # that was built for a different set of directories.
        force = True

    if force or not _LICENSE_CACHE:
        _LICENSE_CACHE = LicenseCache.load_or_build(
            licensedcode_cache_dir=licensedcode_cache_dir,
            scancode_cache_dir=scancode_cache_dir,
            force=force,
            index_all_languages=index_all_languages,
            # used for testing only
            timeout=LICENSE_INDEX_LOCK_TIMEOUT,
            additional_directories=additional_directories,
        )
        # Record what this cache was requested with, including the "no
        # directories" case: otherwise a stale record either triggers a rebuild
        # on every call or hides a needed one. Keep a private copy so later
        # mutation of the caller's sequence cannot change the record.
        _CACHED_DIRECTORIES = list(additional_directories or [])
329374

330375

331376
def load_cache_file(cache_file):
@@ -346,11 +391,15 @@ def load_cache_file(cache_file):
346391
raise Exception(msg) from e
347392

348393

349-
def get_index(force=False, index_all_languages=False, additional_directories=None):
    """
    Return the LicenseIndex held by the LicenseCache from get_cache(), which
    builds and caches it when needed. All arguments are passed to get_cache().
    """
    license_cache = get_cache(
        force=force,
        index_all_languages=index_all_languages,
        additional_directories=additional_directories,
    )
    return license_cache.index
354403

355404

356405
get_cached_index = get_index

src/licensedcode/models.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -768,6 +768,63 @@ def get_rules(
768768
licenses_as_rules = build_rules_from_licenses(licenses_db)
769769
return chain(licenses_as_rules, rules)
770770

771+
def get_license_dirs(
    additional_dirs,
):
    """
    Return a list with the ``licenses`` subdirectory path of each directory in
    ``additional_dirs``, in the same order.

    Paths are only composed as strings here; nothing is checked on disk.
    """
    license_dirs = []
    for path in additional_dirs:
        license_dirs.append(f"{path}/licenses")
    return license_dirs
779+
780+
def get_rule_dirs(
    additional_dirs,
):
    """
    Return a list with the ``rules`` subdirectory path of each directory in
    ``additional_dirs``, in the same order.

    Paths are only composed as strings here; nothing is checked on disk.
    """
    rule_dirs = []
    for path in additional_dirs:
        rule_dirs.append(f"{path}/rules")
    return rule_dirs
788+
789+
def load_licenses_from_multiple_dirs(
    license_directories,
    with_deprecated=False,
):
    """
    Return a single mapping merged from the licenses loaded with
    load_licenses() from each directory in ``license_directories``.

    ``with_deprecated`` is forwarded to load_licenses() for every directory.

    Directories are processed in order: when the same key is loaded from more
    than one directory, the entry from the later directory replaces the
    earlier one.
    """
    combined_licenses = {}
    for license_dir in license_directories:
        licenses = load_licenses(
            licenses_data_dir=license_dir,
            # honor the caller's choice instead of always passing False
            with_deprecated=with_deprecated,
        )
        combined_licenses.update(licenses)
    return combined_licenses
803+
804+
def get_rules_from_multiple_dirs(
    licenses_db,
    rule_directories,
):
    """
    Return an iterable of rules: the rules built from the licenses in
    ``licenses_db`` followed by the rules loaded from every directory in
    ``rule_directories``.

    The loaded rules are validated against ``licenses_db`` with
    validate_rules() before being returned. When ``rule_directories`` is
    empty or None, return the result of get_rules() instead.
    """
    if not rule_directories:
        # NOTE(review): ``rules_data_dir`` is not a parameter of this function,
        # so it resolves to a name defined outside it -- presumably the
        # module default rules directory; confirm.
        return get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)

    # collect the rules of every directory, in order, in one flat list
    rules = []
    for rules_dir in rule_directories:
        rules.extend(load_rules(rules_data_dir=rules_dir))

    validate_rules(rules, licenses_db)
    licenses_as_rules = build_rules_from_licenses(licenses_db)
    return chain(licenses_as_rules, rules)
827+
771828

772829
class InvalidRule(Exception):
773830
pass

src/licensedcode/plugin_license.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from commoncode.resource import clean_path
1919
from plugincode.scan import ScanPlugin
2020
from plugincode.scan import scan_impl
21+
import click
2122

2223
from scancode.api import SCANCODE_LICENSEDB_URL
2324

@@ -139,6 +140,14 @@ class LicenseScanner(ScanPlugin):
139140
help_group=SCAN_OPTIONS_GROUP,
140141
),
141142

143+
PluggableCommandLineOption(
144+
('-dir', '--additional_directories'),
145+
multiple=True,
146+
type=click.Path(exists=True, readable=True, path_type=str),
147+
help='Include additional directories for license detection.',
148+
help_group=SCAN_OPTIONS_GROUP,
149+
),
150+
142151
PluggableCommandLineOption(
143152
('--reindex-licenses',),
144153
is_flag=True, is_eager=True,
@@ -167,7 +176,8 @@ def setup(self, **kwargs):
167176
loaded index.
168177
"""
169178
from licensedcode.cache import populate_cache
170-
populate_cache()
179+
additional_directories = kwargs.get('additional_directories')
180+
populate_cache(additional_directories=additional_directories)
171181

172182
def get_scanner(
173183
self,
@@ -176,6 +186,7 @@ def get_scanner(
176186
license_text_diagnostics=False,
177187
license_url_template=SCANCODE_LICENSEDB_URL,
178188
unknown_licenses=False,
189+
additional_directories=None,
179190
**kwargs
180191
):
181192

@@ -186,6 +197,7 @@ def get_scanner(
186197
license_text_diagnostics=license_text_diagnostics,
187198
license_url_template=license_url_template,
188199
unknown_licenses=unknown_licenses,
200+
additional_directories=additional_directories,
189201
)
190202

191203
def process_codebase(self, codebase, unknown_licenses, **kwargs):

src/scancode/api.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ def get_licenses(
142142
license_url_template=SCANCODE_LICENSEDB_URL,
143143
unknown_licenses=False,
144144
deadline=sys.maxsize,
145+
additional_directories=None,
145146
**kwargs,
146147
):
147148
"""
@@ -168,7 +169,7 @@ def get_licenses(
168169
from licensedcode import cache
169170
from licensedcode.spans import Span
170171

171-
idx = cache.get_index()
172+
idx = cache.get_index(additional_directories=additional_directories)
172173

173174
detected_licenses = []
174175
detected_expressions = []
@@ -252,6 +253,7 @@ def _licenses_data_from_match(
252253
result['homepage_url'] = lic.homepage_url
253254
result['text_url'] = lic.text_urls[0] if lic.text_urls else ''
254255
result['reference_url'] = license_url_template.format(lic.key)
256+
# TODO: change this in the case of a private license?
255257
result['scancode_text_url'] = SCANCODE_LICENSE_TEXT_URL.format(lic.key)
256258
result['scancode_data_url'] = SCANCODE_LICENSE_DATA_URL.format(lic.key)
257259

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
The quick brown fox jumps over the lazy dog.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
key: example1
2+
short_name: Example External License 1
3+
name: Example External License 1
4+
category: Permissive
5+
owner: NexB
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
The quick brown fox jumps over the lazy dog.
2+
The quick brown fox jumps over the lazy dog.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
license_expression: example1
2+
is_license_text: yes
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Lorem ipsum dolor sit amet, consectetur adipiscing elit,
2+
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
3+
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi
4+
ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
5+
in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
6+
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia
7+
deserunt mollit anim id est laborum.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
key: example2
2+
short_name: Example External License 2
3+
name: Example External License 2
4+
category: Permissive
5+
owner: NexB
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Lorem ipsum dolor sit amet, consectetur adipiscing elit
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
license_expression: example2
2+
is_license_text: yes

0 commit comments

Comments
 (0)