Skip to content

Commit 924395d

Browse files
Fix license detection bugs
* Top-level packages were inconsistent because the license plugin post-processing ran after the package plugin post-processing. * Adjust license clues/false-positives/bad matches heuristics. Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent 79871d3 commit 924395d

File tree

8 files changed

+237
-631
lines changed

8 files changed

+237
-631
lines changed

etc/scripts/licenses/buildrules-template.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ is_license_clue: yes
1212
referenced_filenames:
1313
notes:
1414
---
15-
15+
The only changes we've made are to
16+
accommodate the licenses we use, which are GPLv3 and LGPLv3 (or later)
17+
whereas the Linux kernel uses GPLv2.
1618
----------------------------------------
1719
license_expression:
1820
relevance: 100

src/licensedcode/detection.py

Lines changed: 56 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from commoncode.text import python_safe_name
2626
from licensedcode.cache import get_index
2727
from licensedcode.cache import get_cache
28+
from licensedcode.cache import build_spdx_license_expression
2829
from licensedcode.match import LicenseMatch
2930
from licensedcode.match import set_matched_lines
3031
from licensedcode.models import UnDetectedRule
@@ -105,6 +106,7 @@ class DetectionCategory(Enum):
105106
EXTRA_WORDS = 'extra-words'
106107
UNKNOWN_MATCH = 'unknown-match'
107108
LICENSE_CLUES = 'license-clues'
109+
LOW_QUALITY_MATCHES = 'license-clues'
108110
IMPERFECT_COVERAGE = 'imperfect-match-coverage'
109111
FALSE_POSITVE = 'possible-false-positive'
110112
UNDETECTED_LICENSE = 'undetected-license'
@@ -730,6 +732,10 @@ def collect_license_detections(codebase, include_license_clues=True):
730732
)
731733
if not detection_is_same:
732734
package["declared_license_expression"] = license_expression
735+
package["declared_license_expression_spdx"] = str(build_spdx_license_expression(
736+
license_expression=license_expression,
737+
licensing=get_cache().licensing,
738+
))
733739
modified = True
734740

735741
other_license_detections = package["other_license_detections"]
@@ -741,6 +747,10 @@ def collect_license_detections(codebase, include_license_clues=True):
741747
)
742748
if not detection_is_same:
743749
package["other_license_expression"] = license_expression
750+
package["other_license_expression_spdx"] = str(build_spdx_license_expression(
751+
license_expression=license_expression,
752+
licensing=get_cache().licensing,
753+
))
744754
modified = True
745755

746756
if modified:
@@ -753,6 +763,30 @@ def collect_license_detections(codebase, include_license_clues=True):
753763
)
754764
all_license_detections.extend(package_license_detection_objects)
755765

766+
if has_packages and has_licenses:
767+
for package in getattr(codebase.attributes, 'packages', []):
768+
license_expression_package = package["declared_license_expression"]
769+
if not license_expression:
770+
continue
771+
772+
resource_paths = package["datafile_paths"]
773+
if len(resource_paths) == 1:
774+
resource_path = resource_paths[0]
775+
else:
776+
#TODO: implement the correct consistency check
777+
# based on which datafile path the license came from
778+
resource_path = resource_paths[0]
779+
resource = codebase.get_resource(path=resource_path)
780+
resource_packages = getattr(resource, 'package_data', None)
781+
if not resource_packages or len(resource_packages) > 1:
782+
continue
783+
784+
resource_package = resource_packages[0]
785+
if license_expression_package != resource_package["declared_license_expression"]:
786+
package["license_detections"] = resource_package["license_detections"]
787+
package["declared_license_expression"] = resource_package["declared_license_expression"]
788+
package["declared_license_expression_spdx"] = resource_package["declared_license_expression_spdx"]
789+
756790
return all_license_detections
757791

758792

@@ -1107,7 +1141,7 @@ def has_correct_license_clue_matches(license_matches):
11071141
return is_correct_detection(license_matches) and all(match.rule.is_license_clue for match in license_matches)
11081142

11091143

1110-
def is_license_clues(license_matches):
1144+
def is_low_quality_matches(license_matches):
11111145
"""
11121146
Return True if the license_matches are not part of a correct
11131147
license detection and are mere license clues.
@@ -1329,6 +1363,14 @@ def get_detected_license_expression(
13291363
detection_log.append(DetectionRule.LICENSE_CLUES.value)
13301364
return detection_log, combined_expression
13311365

1366+
elif analysis == DetectionCategory.LOW_QUALITY_MATCHES.value:
1367+
if TRACE_ANALYSIS:
1368+
logger_debug(f'analysis {DetectionCategory.LICENSE_CLUES.value}')
1369+
# TODO: we are temporarily returning these as license clues, and not
1370+
# in detections but ideally we should return synthetic unknowns for these
1371+
detection_log.append(DetectionRule.LOW_QUALITY_MATCHES.value)
1372+
return detection_log, combined_expression
1373+
13321374
else:
13331375
if TRACE_ANALYSIS:
13341376
logger_debug(f'analysis not-combined')
@@ -1510,8 +1552,8 @@ def analyze_detection(license_matches, package_license=False):
15101552
elif has_unknown_matches(license_matches=license_matches):
15111553
return DetectionCategory.UNKNOWN_MATCH.value
15121554

1513-
elif is_license_clues(license_matches=license_matches):
1514-
return DetectionCategory.LICENSE_CLUES.value
1555+
elif not package_license and is_low_quality_matches(license_matches=license_matches):
1556+
return DetectionCategory.LOW_QUALITY_MATCHES.value
15151557

15161558
# Case where at least one of the matches have `match_coverage`
15171559
# below IMPERFECT_MATCH_COVERAGE_THR
@@ -1644,19 +1686,24 @@ def process_detections(detections, licensing=Licensing()):
16441686

16451687
for detection in detections:
16461688
if detection.license_expression == None:
1647-
license_keys = licensing.license_keys(expression=detection.license_expression)
1648-
if all(
1649-
key in detected_license_keys
1650-
for key in license_keys
1651-
):
1652-
detection.license_expression = str(combine_expressions(
1689+
if has_correct_license_clue_matches(detection.matches):
1690+
continue
1691+
1692+
license_expression = str(combine_expressions(
16531693
expressions=[
16541694
match.rule.license_expression
16551695
for match in detection.matches
16561696
],
16571697
unique=True,
16581698
licensing=licensing,
16591699
))
1700+
license_keys = licensing.license_keys(expression=license_expression)
1701+
1702+
if all(
1703+
key in detected_license_keys
1704+
for key in license_keys
1705+
):
1706+
detection.license_expression = license_expression
16601707
detection.detection_log.append(DetectionRule.NOT_LICENSE_CLUES.value)
16611708
detection.identifier = detection.identifier_with_expression
16621709

tests/licensedcode/data/licenses_reference_reporting/license-reference-works-with-clues.expected.json

Lines changed: 17 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,6 @@
1212
"detection_count": 1,
1313
"detection_log": []
1414
},
15-
{
16-
"identifier": "bsd_simplified-7517fbd6-3fa4-e9f9-2167-c65251d77656",
17-
"license_expression": "bsd-simplified",
18-
"detection_count": 1,
19-
"detection_log": [
20-
"license-clues",
21-
"not-license-clues-as-more-detections-present"
22-
]
23-
},
2415
{
2516
"identifier": "bzip2_libbzip_2010-7158bcb2-a4d7-9815-17d2-1b1d0a6d5de2",
2617
"license_expression": "bzip2-libbzip-2010",
@@ -1194,8 +1185,8 @@
11941185
{
11951186
"path": "python.LICENSE",
11961187
"type": "file",
1197-
"detected_license_expression": "python AND (other-copyleft AND gpl-1.0-plus) AND (python AND python-cwi) AND bzip2-libbzip-2010 AND sleepycat AND bsd-simplified AND bsd-new AND openssl-ssleay AND openssl AND ssleay-windows AND tcl",
1198-
"detected_license_expression_spdx": "Python-2.0 AND (LicenseRef-scancode-other-copyleft AND GPL-1.0-or-later) AND (Python-2.0 AND LicenseRef-scancode-python-cwi) AND bzip2-1.0.6 AND Sleepycat AND BSD-2-Clause AND BSD-3-Clause AND OpenSSL AND LicenseRef-scancode-openssl AND LicenseRef-scancode-ssleay-windows AND TCL",
1188+
"detected_license_expression": "python AND (other-copyleft AND gpl-1.0-plus) AND (python AND python-cwi) AND bzip2-libbzip-2010 AND sleepycat AND bsd-new AND openssl-ssleay AND openssl AND ssleay-windows AND tcl",
1189+
"detected_license_expression_spdx": "Python-2.0 AND (LicenseRef-scancode-other-copyleft AND GPL-1.0-or-later) AND (Python-2.0 AND LicenseRef-scancode-python-cwi) AND bzip2-1.0.6 AND Sleepycat AND BSD-3-Clause AND OpenSSL AND LicenseRef-scancode-openssl AND LicenseRef-scancode-ssleay-windows AND TCL",
11991190
"license_detections": [
12001191
{
12011192
"license_expression": "python",
@@ -1431,29 +1422,6 @@
14311422
],
14321423
"identifier": "sleepycat-a7cd8833-ecc2-8ade-54d7-392befcce801"
14331424
},
1434-
{
1435-
"license_expression": "bsd-simplified",
1436-
"matches": [
1437-
{
1438-
"score": 33.71,
1439-
"start_line": 358,
1440-
"end_line": 363,
1441-
"matched_length": 59,
1442-
"match_coverage": 33.71,
1443-
"matcher": "3-seq",
1444-
"license_expression": "bsd-simplified",
1445-
"rule_identifier": "bsd-simplified_242.RULE",
1446-
"rule_relevance": 100,
1447-
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-simplified_242.RULE",
1448-
"matched_text": "INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF\n * THE POSSIBILITY OF SUCH DAMAGE."
1449-
}
1450-
],
1451-
"detection_log": [
1452-
"license-clues",
1453-
"not-license-clues-as-more-detections-present"
1454-
],
1455-
"identifier": "bsd_simplified-7517fbd6-3fa4-e9f9-2167-c65251d77656"
1456-
},
14571425
{
14581426
"license_expression": "bsd-new",
14591427
"matches": [
@@ -1653,7 +1621,21 @@
16531621
"identifier": "tcl-75d8de8c-9cf0-d604-4b99-e03436ebfcd3"
16541622
}
16551623
],
1656-
"license_clues": [],
1624+
"license_clues": [
1625+
{
1626+
"score": 33.71,
1627+
"start_line": 358,
1628+
"end_line": 363,
1629+
"matched_length": 59,
1630+
"match_coverage": 33.71,
1631+
"matcher": "3-seq",
1632+
"license_expression": "bsd-simplified",
1633+
"rule_identifier": "bsd-simplified_242.RULE",
1634+
"rule_relevance": 100,
1635+
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-simplified_242.RULE",
1636+
"matched_text": "INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF\n * THE POSSIBILITY OF SUCH DAMAGE."
1637+
}
1638+
],
16571639
"percentage_of_license_text": 83.64,
16581640
"scan_errors": []
16591641
}

tests/packagedcode/data/debian/copyright/debian-2019-11-15/main/m/mariadb-10.3/stable_copyright-detailed.expected.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -545,7 +545,7 @@ other_license_detections:
545545
2. Redistributions in binary form must the following disclaimer in
546546
the documentation and/or other materials provided with the
547547
distribution.
548-
identifier:
548+
identifier: bsd_new-7c8321ea-5f82-974c-692b-936bcaabf520
549549
- license_expression: bsd-new
550550
matches:
551551
- score: '100.0'

tests/packagedcode/data/debian/copyright/debian-2019-11-15/main/p/perl/copyright-detailed.expected.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3337,7 +3337,7 @@ other_license_detections:
33373337
matched_text: |
33383338
license notice, but it is assumed that it
33393339
is licensed under the same terms as
3340-
identifier:
3340+
identifier: artistic_perl_1_0_or_gpl_1_0_plus-f8a67153-d3ca-59da-be8b-68b1535b0862
33413341
- license_expression: artistic-perl-1.0 OR gpl-1.0-plus
33423342
matches:
33433343
- score: '100.0'

0 commit comments

Comments
 (0)