Skip to content

Commit 74bc574

Browse files
Fix license detection bugs
* Top level packages were inconsistent because of license plugin post processing running after the package plugin post processing. * Adjust license clues/false-positives/bad matches heuristics. Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent 79871d3 commit 74bc574

File tree

9 files changed

+286
-652
lines changed

9 files changed

+286
-652
lines changed

etc/scripts/licenses/buildrules-template.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ is_license_clue: yes
1212
referenced_filenames:
1313
notes:
1414
---
15-
15+
The only changes we've made are to
16+
accommodate the licenses we use, which are GPLv3 and LGPLv3 (or later)
17+
whereas the Linux kernel uses GPLv2.
1618
----------------------------------------
1719
license_expression:
1820
relevance: 100

src/licensedcode/detection.py

Lines changed: 56 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from commoncode.text import python_safe_name
2626
from licensedcode.cache import get_index
2727
from licensedcode.cache import get_cache
28+
from licensedcode.cache import build_spdx_license_expression
2829
from licensedcode.match import LicenseMatch
2930
from licensedcode.match import set_matched_lines
3031
from licensedcode.models import UnDetectedRule
@@ -105,6 +106,7 @@ class DetectionCategory(Enum):
105106
EXTRA_WORDS = 'extra-words'
106107
UNKNOWN_MATCH = 'unknown-match'
107108
LICENSE_CLUES = 'license-clues'
109+
LOW_QUALITY_MATCHES = 'license-clues'
108110
IMPERFECT_COVERAGE = 'imperfect-match-coverage'
109111
FALSE_POSITVE = 'possible-false-positive'
110112
UNDETECTED_LICENSE = 'undetected-license'
@@ -730,6 +732,10 @@ def collect_license_detections(codebase, include_license_clues=True):
730732
)
731733
if not detection_is_same:
732734
package["declared_license_expression"] = license_expression
735+
package["declared_license_expression_spdx"] = str(build_spdx_license_expression(
736+
license_expression=license_expression,
737+
licensing=get_cache().licensing,
738+
))
733739
modified = True
734740

735741
other_license_detections = package["other_license_detections"]
@@ -741,6 +747,10 @@ def collect_license_detections(codebase, include_license_clues=True):
741747
)
742748
if not detection_is_same:
743749
package["other_license_expression"] = license_expression
750+
package["other_license_expression_spdx"] = str(build_spdx_license_expression(
751+
license_expression=license_expression,
752+
licensing=get_cache().licensing,
753+
))
744754
modified = True
745755

746756
if modified:
@@ -753,6 +763,30 @@ def collect_license_detections(codebase, include_license_clues=True):
753763
)
754764
all_license_detections.extend(package_license_detection_objects)
755765

766+
if has_packages and has_licenses:
767+
for package in getattr(codebase.attributes, 'packages', []):
768+
license_expression_package = package["declared_license_expression"]
769+
if not license_expression_package:
770+
continue
771+
772+
resource_paths = package["datafile_paths"]
773+
if len(resource_paths) == 1:
774+
resource_path = resource_paths[0]
775+
else:
776+
#TODO: implement the correct consistency check
777+
# based on which datafile path the license came from
778+
resource_path = resource_paths[0]
779+
resource = codebase.get_resource(path=resource_path)
780+
resource_packages = getattr(resource, 'package_data', None)
781+
if not resource_packages or len(resource_packages) > 1:
782+
continue
783+
784+
resource_package = resource_packages[0]
785+
if license_expression_package != resource_package["declared_license_expression"]:
786+
package["license_detections"] = resource_package["license_detections"]
787+
package["declared_license_expression"] = resource_package["declared_license_expression"]
788+
package["declared_license_expression_spdx"] = resource_package["declared_license_expression_spdx"]
789+
756790
return all_license_detections
757791

758792

@@ -1107,7 +1141,7 @@ def has_correct_license_clue_matches(license_matches):
11071141
return is_correct_detection(license_matches) and all(match.rule.is_license_clue for match in license_matches)
11081142

11091143

1110-
def is_license_clues(license_matches):
1144+
def is_low_quality_matches(license_matches):
11111145
"""
11121146
Return True if the license_matches are not part of a correct
11131147
license detection and are mere license clues.
@@ -1329,6 +1363,14 @@ def get_detected_license_expression(
13291363
detection_log.append(DetectionRule.LICENSE_CLUES.value)
13301364
return detection_log, combined_expression
13311365

1366+
elif analysis == DetectionCategory.LOW_QUALITY_MATCHES.value:
1367+
if TRACE_ANALYSIS:
1368+
logger_debug(f'analysis {DetectionCategory.LICENSE_CLUES.value}')
1369+
# TODO: we are temporarily returning these as license clues, and not
1370+
# in detections but ideally we should return synthetic unknowns for these
1371+
detection_log.append(DetectionRule.LOW_QUALITY_MATCHES.value)
1372+
return detection_log, combined_expression
1373+
13321374
else:
13331375
if TRACE_ANALYSIS:
13341376
logger_debug(f'analysis not-combined')
@@ -1510,8 +1552,8 @@ def analyze_detection(license_matches, package_license=False):
15101552
elif has_unknown_matches(license_matches=license_matches):
15111553
return DetectionCategory.UNKNOWN_MATCH.value
15121554

1513-
elif is_license_clues(license_matches=license_matches):
1514-
return DetectionCategory.LICENSE_CLUES.value
1555+
elif not package_license and is_low_quality_matches(license_matches=license_matches):
1556+
return DetectionCategory.LOW_QUALITY_MATCHES.value
15151557

15161558
# Case where at least one of the matches have `match_coverage`
15171559
# below IMPERFECT_MATCH_COVERAGE_THR
@@ -1644,19 +1686,24 @@ def process_detections(detections, licensing=Licensing()):
16441686

16451687
for detection in detections:
16461688
if detection.license_expression == None:
1647-
license_keys = licensing.license_keys(expression=detection.license_expression)
1648-
if all(
1649-
key in detected_license_keys
1650-
for key in license_keys
1651-
):
1652-
detection.license_expression = str(combine_expressions(
1689+
if has_correct_license_clue_matches(detection.matches):
1690+
continue
1691+
1692+
license_expression = str(combine_expressions(
16531693
expressions=[
16541694
match.rule.license_expression
16551695
for match in detection.matches
16561696
],
16571697
unique=True,
16581698
licensing=licensing,
16591699
))
1700+
license_keys = licensing.license_keys(expression=license_expression)
1701+
1702+
if all(
1703+
key in detected_license_keys
1704+
for key in license_keys
1705+
):
1706+
detection.license_expression = license_expression
16601707
detection.detection_log.append(DetectionRule.NOT_LICENSE_CLUES.value)
16611708
detection.identifier = detection.identifier_with_expression
16621709

tests/formattedcode/data/yaml/package-and-licenses-expected.yaml

Lines changed: 49 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -90,21 +90,21 @@ packages:
9090
- license_expression: apache-2.0
9191
matches:
9292
- score: '100.0'
93-
start_line: 4
94-
end_line: 4
95-
matched_length: 4
93+
start_line: 1
94+
end_line: 1
95+
matched_length: 3
9696
match_coverage: '100.0'
97-
matcher: 2-aho
97+
matcher: 1-hash
9898
license_expression: apache-2.0
99-
rule_identifier: apache-2.0_65.RULE
99+
rule_identifier: spdx_license_id_apache-2.0_for_apache-2.0.RULE
100100
rule_relevance: 100
101-
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/apache-2.0_65.RULE
102-
matched_text: license = Apache-2.0
103-
identifier: apache_2_0-ec759ae0-ea5a-f138-793e-388520e080c0
101+
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/spdx_license_id_apache-2.0_for_apache-2.0.RULE
102+
matched_text: Apache-2.0
103+
identifier: apache_2_0-d66ab77d-a5cc-7104-e702-dc7df61fe9e8
104104
other_license_expression:
105105
other_license_expression_spdx:
106106
other_license_detections: []
107-
extracted_license_statement:
107+
extracted_license_statement: Apache-2.0
108108
notice_text:
109109
source_packages: []
110110
extra_data: {}
@@ -119,12 +119,15 @@ packages:
119119
purl: pkg:pypi/codebase
120120
dependencies: []
121121
license_detections:
122-
- identifier: apache_2_0-ec759ae0-ea5a-f138-793e-388520e080c0
123-
license_expression: apache-2.0
124-
detection_count: 2
125122
- identifier: apache_2_0-ab23f79b-ec38-9a8a-9b23-85059407f34d
126123
license_expression: apache-2.0
127124
detection_count: 1
125+
- identifier: apache_2_0-d66ab77d-a5cc-7104-e702-dc7df61fe9e8
126+
license_expression: apache-2.0
127+
detection_count: 1
128+
- identifier: apache_2_0-ec759ae0-ea5a-f138-793e-388520e080c0
129+
license_expression: apache-2.0
130+
detection_count: 1
128131
- identifier: apache_2_0_and__apache_2_0_or_mit-9b638e72-e872-a67f-3447-eec297ef7b39
129132
license_expression: apache-2.0 AND (apache-2.0 OR mit)
130133
detection_count: 1
@@ -706,6 +709,31 @@ license_rule_references:
706709
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
707710
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
708711
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
712+
- license_expression: apache-2.0
713+
identifier: spdx_license_id_apache-2.0_for_apache-2.0.RULE
714+
language: en
715+
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/spdx_license_id_apache-2.0_for_apache-2.0.RULE
716+
is_license_text: no
717+
is_license_notice: no
718+
is_license_reference: yes
719+
is_license_tag: no
720+
is_license_intro: no
721+
is_license_clue: no
722+
is_continuous: no
723+
is_builtin: yes
724+
is_from_license: no
725+
is_synthetic: no
726+
length: 3
727+
relevance: 100
728+
minimum_coverage: 100
729+
referenced_filenames: []
730+
notes: Used to detect a bare SPDX license id
731+
ignorable_copyrights: []
732+
ignorable_holders: []
733+
ignorable_authors: []
734+
ignorable_urls: []
735+
ignorable_emails: []
736+
text: apache-2.0
709737
files:
710738
- path: package-and-licenses
711739
type: directory
@@ -1148,21 +1176,21 @@ files:
11481176
- license_expression: apache-2.0
11491177
matches:
11501178
- score: '100.0'
1151-
start_line: 4
1152-
end_line: 4
1153-
matched_length: 4
1179+
start_line: 1
1180+
end_line: 1
1181+
matched_length: 3
11541182
match_coverage: '100.0'
1155-
matcher: 2-aho
1183+
matcher: 1-hash
11561184
license_expression: apache-2.0
1157-
rule_identifier: apache-2.0_65.RULE
1185+
rule_identifier: spdx_license_id_apache-2.0_for_apache-2.0.RULE
11581186
rule_relevance: 100
1159-
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/apache-2.0_65.RULE
1160-
matched_text: license = Apache-2.0
1161-
identifier: apache_2_0-ec759ae0-ea5a-f138-793e-388520e080c0
1187+
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/spdx_license_id_apache-2.0_for_apache-2.0.RULE
1188+
matched_text: Apache-2.0
1189+
identifier: apache_2_0-d66ab77d-a5cc-7104-e702-dc7df61fe9e8
11621190
other_license_expression:
11631191
other_license_expression_spdx:
11641192
other_license_detections: []
1165-
extracted_license_statement:
1193+
extracted_license_statement: Apache-2.0
11661194
notice_text:
11671195
source_packages: []
11681196
file_references: []

tests/licensedcode/data/licenses_reference_reporting/license-reference-works-with-clues.expected.json

Lines changed: 17 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,6 @@
1212
"detection_count": 1,
1313
"detection_log": []
1414
},
15-
{
16-
"identifier": "bsd_simplified-7517fbd6-3fa4-e9f9-2167-c65251d77656",
17-
"license_expression": "bsd-simplified",
18-
"detection_count": 1,
19-
"detection_log": [
20-
"license-clues",
21-
"not-license-clues-as-more-detections-present"
22-
]
23-
},
2415
{
2516
"identifier": "bzip2_libbzip_2010-7158bcb2-a4d7-9815-17d2-1b1d0a6d5de2",
2617
"license_expression": "bzip2-libbzip-2010",
@@ -1194,8 +1185,8 @@
11941185
{
11951186
"path": "python.LICENSE",
11961187
"type": "file",
1197-
"detected_license_expression": "python AND (other-copyleft AND gpl-1.0-plus) AND (python AND python-cwi) AND bzip2-libbzip-2010 AND sleepycat AND bsd-simplified AND bsd-new AND openssl-ssleay AND openssl AND ssleay-windows AND tcl",
1198-
"detected_license_expression_spdx": "Python-2.0 AND (LicenseRef-scancode-other-copyleft AND GPL-1.0-or-later) AND (Python-2.0 AND LicenseRef-scancode-python-cwi) AND bzip2-1.0.6 AND Sleepycat AND BSD-2-Clause AND BSD-3-Clause AND OpenSSL AND LicenseRef-scancode-openssl AND LicenseRef-scancode-ssleay-windows AND TCL",
1188+
"detected_license_expression": "python AND (other-copyleft AND gpl-1.0-plus) AND (python AND python-cwi) AND bzip2-libbzip-2010 AND sleepycat AND bsd-new AND openssl-ssleay AND openssl AND ssleay-windows AND tcl",
1189+
"detected_license_expression_spdx": "Python-2.0 AND (LicenseRef-scancode-other-copyleft AND GPL-1.0-or-later) AND (Python-2.0 AND LicenseRef-scancode-python-cwi) AND bzip2-1.0.6 AND Sleepycat AND BSD-3-Clause AND OpenSSL AND LicenseRef-scancode-openssl AND LicenseRef-scancode-ssleay-windows AND TCL",
11991190
"license_detections": [
12001191
{
12011192
"license_expression": "python",
@@ -1431,29 +1422,6 @@
14311422
],
14321423
"identifier": "sleepycat-a7cd8833-ecc2-8ade-54d7-392befcce801"
14331424
},
1434-
{
1435-
"license_expression": "bsd-simplified",
1436-
"matches": [
1437-
{
1438-
"score": 33.71,
1439-
"start_line": 358,
1440-
"end_line": 363,
1441-
"matched_length": 59,
1442-
"match_coverage": 33.71,
1443-
"matcher": "3-seq",
1444-
"license_expression": "bsd-simplified",
1445-
"rule_identifier": "bsd-simplified_242.RULE",
1446-
"rule_relevance": 100,
1447-
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-simplified_242.RULE",
1448-
"matched_text": "INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF\n * THE POSSIBILITY OF SUCH DAMAGE."
1449-
}
1450-
],
1451-
"detection_log": [
1452-
"license-clues",
1453-
"not-license-clues-as-more-detections-present"
1454-
],
1455-
"identifier": "bsd_simplified-7517fbd6-3fa4-e9f9-2167-c65251d77656"
1456-
},
14571425
{
14581426
"license_expression": "bsd-new",
14591427
"matches": [
@@ -1653,7 +1621,21 @@
16531621
"identifier": "tcl-75d8de8c-9cf0-d604-4b99-e03436ebfcd3"
16541622
}
16551623
],
1656-
"license_clues": [],
1624+
"license_clues": [
1625+
{
1626+
"score": 33.71,
1627+
"start_line": 358,
1628+
"end_line": 363,
1629+
"matched_length": 59,
1630+
"match_coverage": 33.71,
1631+
"matcher": "3-seq",
1632+
"license_expression": "bsd-simplified",
1633+
"rule_identifier": "bsd-simplified_242.RULE",
1634+
"rule_relevance": 100,
1635+
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-simplified_242.RULE",
1636+
"matched_text": "INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF\n * THE POSSIBILITY OF SUCH DAMAGE."
1637+
}
1638+
],
16571639
"percentage_of_license_text": 83.64,
16581640
"scan_errors": []
16591641
}

tests/packagedcode/data/debian/copyright/debian-2019-11-15/main/m/mariadb-10.3/stable_copyright-detailed.expected.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -545,7 +545,7 @@ other_license_detections:
545545
2. Redistributions in binary form must the following disclaimer in
546546
the documentation and/or other materials provided with the
547547
distribution.
548-
identifier:
548+
identifier: bsd_new-7c8321ea-5f82-974c-692b-936bcaabf520
549549
- license_expression: bsd-new
550550
matches:
551551
- score: '100.0'

tests/packagedcode/data/debian/copyright/debian-2019-11-15/main/p/perl/copyright-detailed.expected.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3337,7 +3337,7 @@ other_license_detections:
33373337
matched_text: |
33383338
license notice, but it is assumed that it
33393339
is licensed under the same terms as
3340-
identifier:
3340+
identifier: artistic_perl_1_0_or_gpl_1_0_plus-f8a67153-d3ca-59da-be8b-68b1535b0862
33413341
- license_expression: artistic-perl-1.0 OR gpl-1.0-plus
33423342
matches:
33433343
- score: '100.0'

0 commit comments

Comments
 (0)