diff --git a/src/macaron/build_spec_generator/common_spec/base_spec.py b/src/macaron/build_spec_generator/common_spec/base_spec.py index 9fe4e0e0d..11b5e8850 100644 --- a/src/macaron/build_spec_generator/common_spec/base_spec.py +++ b/src/macaron/build_spec_generator/common_spec/base_spec.py @@ -50,7 +50,7 @@ class BaseBuildSpecDict(TypedDict, total=False): newline: NotRequired[str] #: The version of the programming language or runtime, e.g., '11' for JDK, '3.11' for Python. - language_version: Required[str] + language_version: Required[list[str]] #: List of release dependencies. dependencies: NotRequired[list[str]] @@ -73,6 +73,11 @@ class BaseBuildSpecDict(TypedDict, total=False): #: Entry point script, class, or binary for running the project. entry_point: NotRequired[str | None] + #: A "back end" is tool that a "front end" (such as pip/build) would call to + #: package the source distribution into the wheel format. build_backends would + #: be a list of these that were used in building the wheel alongside their version. 
+ build_backends: NotRequired[dict[str, str]] + class BaseBuildSpec(ABC): """Abstract base class for build specification behavior and field resolution.""" diff --git a/src/macaron/build_spec_generator/common_spec/core.py b/src/macaron/build_spec_generator/common_spec/core.py index e7f8610e3..d13ff3d2c 100644 --- a/src/macaron/build_spec_generator/common_spec/core.py +++ b/src/macaron/build_spec_generator/common_spec/core.py @@ -442,7 +442,7 @@ def gen_generic_build_spec( "git_repo": latest_component_repository.remote_path, "git_tag": latest_component_repository.commit_sha, "newline": "lf", - "language_version": lang_version or "", + "language_version": [lang_version] if lang_version else [], "ecosystem": purl.type, "purl": str(purl), "language": target_language, diff --git a/src/macaron/build_spec_generator/common_spec/maven_spec.py b/src/macaron/build_spec_generator/common_spec/maven_spec.py index f08602f28..ddfe96b71 100644 --- a/src/macaron/build_spec_generator/common_spec/maven_spec.py +++ b/src/macaron/build_spec_generator/common_spec/maven_spec.py @@ -58,12 +58,14 @@ def resolve_fields(self, purl: PackageURL) -> None: jdk_from_jar or "Cannot find any.", ) + existing = self.data["language_version"][0] if self.data["language_version"] else None + # Select JDK from jar or another source, with a default of version 8. 
- selected_jdk_version = jdk_from_jar or self.data["language_version"] if self.data["language_version"] else "8" + selected_jdk_version = jdk_from_jar or existing if existing else "8" major_jdk_version = normalize_jdk_version(selected_jdk_version) if not major_jdk_version: logger.error("Failed to obtain the major version of %s", selected_jdk_version) return - self.data["language_version"] = major_jdk_version + self.data["language_version"] = [major_jdk_version] diff --git a/src/macaron/build_spec_generator/common_spec/pypi_spec.py b/src/macaron/build_spec_generator/common_spec/pypi_spec.py index 94489883c..9c95e3dab 100644 --- a/src/macaron/build_spec_generator/common_spec/pypi_spec.py +++ b/src/macaron/build_spec_generator/common_spec/pypi_spec.py @@ -3,10 +3,22 @@ """This module includes build specification and helper classes for PyPI packages.""" +import logging +import os +import re +import tomli from packageurl import PackageURL +from packaging.requirements import InvalidRequirement, Requirement +from packaging.utils import InvalidWheelFilename, parse_wheel_filename from macaron.build_spec_generator.common_spec.base_spec import BaseBuildSpec, BaseBuildSpecDict +from macaron.config.defaults import defaults +from macaron.errors import SourceCodeError +from macaron.slsa_analyzer.package_registry import pypi_registry +from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo + +logger: logging.Logger = logging.getLogger(__name__) class PyPIBuildSpec( @@ -34,3 +46,161 @@ def resolve_fields(self, purl: PackageURL) -> None: purl: str The target software component Package URL. 
""" + if purl.type != "pypi": + return + + registry = pypi_registry.PyPIRegistry() + registry.load_defaults() + + registry_info = PackageRegistryInfo( + build_tool_name="pip", + build_tool_purl_type="pypi", + package_registry=registry, + metadata=[], + ) + + pypi_package_json = pypi_registry.find_or_create_pypi_asset(purl.name, purl.version, registry_info) + + if pypi_package_json is not None: + if pypi_package_json.package_json or pypi_package_json.download(dest=""): + requires_array: list[str] = [] + build_backends: dict[str, str] = {} + python_version_list: list[str] = [] + try: + with pypi_package_json.wheel(): + logger.debug("Wheel at %s", pypi_package_json.wheel_path) + # Should only have .dist-info directory + logger.debug("It has directories %s", ",".join(os.listdir(pypi_package_json.wheel_path))) + wheel_contents, metadata_contents = self.read_directory(pypi_package_json.wheel_path, purl) + generator, version = self.read_generator_line(wheel_contents) + if generator != "": + build_backends[generator] = "==" + version + if generator != "setuptools": + # Apply METADATA heuristics to determine setuptools version + if "License-File" in metadata_contents: + build_backends["setuptools"] = "==" + defaults.get( + "heuristic.pypi", "setuptools_version_emitting_license" + ) + elif "Platform: UNKNOWN" in metadata_contents: + build_backends["setuptools"] = "==" + defaults.get( + "heuristic.pypi", "setuptools_version_emitting_platform_unknown" + ) + else: + build_backends["setuptools"] = "==" + defaults.get( + "heuristic.pypi", "default_setuptools" + ) + except SourceCodeError: + logger.debug("Could not find pure wheel matching this PURL") + + logger.debug("From .dist_info:") + logger.debug(build_backends) + + try: + with pypi_package_json.sourcecode(): + try: + pyproject_content = pypi_package_json.get_sourcecode_file_contents("pyproject.toml") + content = tomli.loads(pyproject_content.decode("utf-8")) + build_system: dict[str, list[str]] = content.get("build-system", 
def read_directory(self, wheel_path: str, purl: PackageURL) -> tuple[str, str]:
    """
    Read in the WHEEL and METADATA file from the .dist-info directory.

    Parameters
    ----------
    wheel_path : str
        Path to the temporary directory where the wheel was
        downloaded into.
    purl: PackageURL
        PURL corresponding to the package being analyzed.

    Returns
    -------
    tuple[str, str]
        Tuple where the first element is a string of the .dist-info/WHEEL
        contents and the second element is a string of the .dist-info/METADATA
        contents. A file that is missing is reported as an empty string, and
        both elements are empty when the .dist-info directory does not exist.
    """
    # From https://peps.python.org/pep-0427/#escaping-and-unicode
    # The flag must be passed by keyword: the fourth positional parameter of
    # re.sub is ``count``, so a positional re.UNICODE (== 32) capped the number
    # of substitutions at 32 instead of selecting Unicode matching.
    normalized_name = re.sub(r"[^\w\d.]+", "_", purl.name, flags=re.UNICODE)
    dist_info = f"{normalized_name}-{purl.version}.dist-info"
    logger.debug(dist_info)

    dist_info_path = os.path.join(wheel_path, dist_info)
    if not os.path.isdir(dist_info_path):
        return "", ""

    # Read both files the same way; the ``wheel_path`` parameter is no longer
    # rebound to a different meaning half-way through the method.
    contents: list[str] = []
    for file_name in ("WHEEL", "METADATA"):
        file_path = os.path.join(dist_info_path, file_name)
        if os.path.isfile(file_path):
            with open(file_path, encoding="utf-8") as dist_info_file:
                contents.append(dist_info_file.read())
        else:
            contents.append("")

    return contents[0], contents[1]


def read_generator_line(self, wheel_contents: str) -> tuple[str, str]:
    """
    Parse through the "Generator: {build backend} {version}" line of .dist_info/WHEEL.

    Parameters
    ----------
    wheel_contents : str
        String of the contents of the .dist_info/WHEEL file

    Returns
    -------
    tuple[str, str]
        Tuple where the first element is the generating build backend and
        the second element is its version, exactly as written in the file
        (for example ``"(80.9.0)"`` when the version is parenthesised).
        The version is an empty string when the line names a backend without
        a version, and both elements are empty when no usable Generator line
        is present.
    """
    prefix = "Generator:"
    for line in wheel_contents.splitlines():
        if not line.startswith(prefix):
            continue
        # Split on any run of whitespace so that a missing version or repeated
        # spaces cannot raise IndexError or produce an empty backend name.
        tokens = line[len(prefix) :].split()
        if not tokens:
            continue
        version = tokens[1] if len(tokens) > 1 else ""
        return tokens[0], version
    return "", ""
def download_package_wheel(self, url: str) -> str:
    """Download the wheel at input url and unpack its ``.dist-info`` metadata files.

    Only the ``WHEEL`` and ``METADATA`` files of a top-level ``*.dist-info``
    directory are extracted. The downloaded archive itself is removed before
    returning.

    Parameters
    ----------
    url: str
        The wheel's url.

    Returns
    -------
    str
        The temp directory storing {distribution}-{version}.dist-info/WHEEL and
        {distribution}-{version}.dist-info/METADATA.

    Raises
    ------
    InvalidHTTPResponseError
        If the HTTP request to the registry fails or an unexpected response is returned.
    """
    # Get name of file.
    _, _, file_name = url.rpartition("/")
    # Remove the .whl to get wheel name
    wheel_name = re.sub(r"\.whl$", "", file_name)
    # Makes a directory in the OS's temp folder
    temp_dir = tempfile.mkdtemp(prefix=f"{wheel_name}_")
    # get temp_dir/file_name
    wheel_file = os.path.join(temp_dir, file_name)
    # Same timeout and size limit as in download_package_sourcecode
    timeout = defaults.getint("downloads", "timeout", fallback=120)
    size_limit = defaults.getint("downloads", "max_download_size", fallback=10000000)

    # NOTE(review): cleanup_sourcecode_directory is presumably what removes temp_dir and raises
    # InvalidHTTPResponseError when it is given a message - confirm against its definition.
    if not download_file_with_size_limit(url, {}, wheel_file, timeout, size_limit):
        self.cleanup_sourcecode_directory(temp_dir, "Could not download the file.")

    # Wheel is a zip
    if not zipfile.is_zipfile(wheel_file):
        self.cleanup_sourcecode_directory(temp_dir, f"Unable to extract source code from file {file_name}")

    try:
        with zipfile.ZipFile(wheel_file) as zip_file:
            members = []
            for member in zip_file.infolist():
                # Zip member names always use "/" as the separator. Accept only
                # <something>.dist-info/WHEEL and <something>.dist-info/METADATA at the
                # archive root, rather than any entry whose name merely ends with
                # "WHEEL" or "METADATA".
                parent, _, base_name = member.filename.rpartition("/")
                if base_name not in ("WHEEL", "METADATA"):
                    continue
                if parent.endswith(".dist-info") and "/" not in parent:
                    members.append(member)
            # Intended suppression. The tool is unable to see that .extractall is being called with a filter
            zip_file.extractall(temp_dir, members)  # nosec B202:tarfile_unsafe_members
    except zipfile.BadZipFile as bad_zip:
        self.cleanup_sourcecode_directory(temp_dir, f"Error extracting wheel: {bad_zip}", bad_zip)

    # Now we should have it like:
    # temp_dir/wheel_name.whl
    # temp_dir/wheel_name.dist-info/

    os.remove(wheel_file)

    logger.debug("Temporary download and unzip of %s stored in %s", file_name, temp_dir)
    return temp_dir
@contextmanager
def wheel(self) -> Generator[None]:
    """Download and cleanup wheel of the package with a context manager.

    Raises
    ------
    SourceCodeError
        If the wheel cannot be downloaded.
    """
    if not self.download_wheel():
        raise SourceCodeError("Unable to download requested wheel.")
    try:
        yield
    finally:
        # Run the cleanup even when the body of the ``with`` block raises, so the
        # temporary directory is not left behind.
        if self.wheel_path:
            # Name for cleanup_sourcecode_directory could be refactored here
            PyPIRegistry.cleanup_sourcecode_directory(self.wheel_path)


def download_wheel(self) -> bool:
    """Download and extract wheel metadata to a temporary directory.

    On success the location of the temporary directory is stored in ``self.wheel_path``.

    Returns
    -------
    bool
        ``True`` if the wheel is downloaded and extracted successfully; ``False`` if not.
    """
    url = self.get_wheel_url()
    if not url:
        return False
    try:
        self.wheel_path = self.pypi_registry.download_package_wheel(url)
    except InvalidHTTPResponseError as error:
        logger.debug(error)
        return False
    return True
"d2b95262091d6572cc12dcda57d89f9cd44ac88b", "newline": "lf", "language_version": "11", "ecosystem": "maven", "purl": "pkg:maven/org.apache.hugegraph/computer-k8s@1.0.0", "language": "java", "build_tool": "maven", "build_commands": [["mvn", "-DskipTests=true", "-Dmaven.test.skip=true", "-Dmaven.site.skip=true", "-Drat.skip=true", "-Dmaven.javadoc.skip=true", "clean", "package"]]} +{"macaron_version": "0.18.0", "group_id": "org.apache.hugegraph", "artifact_id": "computer-k8s", "version": "1.0.0", "git_repo": "https://github.com/apache/hugegraph-computer", "git_tag": "d2b95262091d6572cc12dcda57d89f9cd44ac88b", "newline": "lf", "language_version": ["11"], "ecosystem": "maven", "purl": "pkg:maven/org.apache.hugegraph/computer-k8s@1.0.0", "language": "java", "build_tool": "maven", "build_commands": [["mvn", "-DskipTests=true", "-Dmaven.test.skip=true", "-Dmaven.site.skip=true", "-Drat.skip=true", "-Dmaven.javadoc.skip=true", "clean", "package"]]} diff --git a/tests/integration/cases/pypi_cachetools/expected_default.buildspec b/tests/integration/cases/pypi_cachetools/expected_default.buildspec new file mode 100644 index 000000000..469c99c9d --- /dev/null +++ b/tests/integration/cases/pypi_cachetools/expected_default.buildspec @@ -0,0 +1 @@ +{"macaron_version": "0.18.0", "group_id": null, "artifact_id": "cachetools", "version": "6.2.1", "git_repo": "https://github.com/tkem/cachetools", "git_tag": "ca7508fd56103a1b6d6f17c8e93e36c60b44ca25", "newline": "lf", "language_version": ["py3"], "ecosystem": "pypi", "purl": "pkg:pypi/cachetools@6.2.1", "language": "python", "build_tool": "pip", "build_commands": [["pip", "install", "coverage", "tox"]], "build_backends": {"setuptools": "==(80.9.0)", "wheel": ""}} diff --git a/tests/integration/cases/pypi_cachetools/test.yaml b/tests/integration/cases/pypi_cachetools/test.yaml new file mode 100644 index 000000000..c2d0575b1 --- /dev/null +++ b/tests/integration/cases/pypi_cachetools/test.yaml @@ -0,0 +1,32 @@ +# Copyright (c) 2025 - 
2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Analyzing a PyPI PURL that has provenance available on the PyPI registry, and passes the SCM authenticity check. + It also tests buildspec generation. + +tags: +- macaron-python-package +- tutorial + +steps: +- name: Run macaron analyze + kind: analyze + options: + command_args: + - -purl + - pkg:pypi/cachetools@6.2.1 +- name: Generate the buildspec + kind: gen-build-spec + options: + command_args: + - -purl + - pkg:pypi/cachetools@6.2.1 + - --output-format + - default-buildspec +- name: Compare Buildspec. + kind: compare + options: + kind: default_build_spec + result: output/buildspec/pypi/cachetools/macaron.buildspec + expected: expected_default.buildspec diff --git a/tests/integration/cases/pypi_toga/expected_default.buildspec b/tests/integration/cases/pypi_toga/expected_default.buildspec index ab168ac03..d5335b3e5 100644 --- a/tests/integration/cases/pypi_toga/expected_default.buildspec +++ b/tests/integration/cases/pypi_toga/expected_default.buildspec @@ -1 +1 @@ -{"macaron_version": "0.18.0", "group_id": null, "artifact_id": "toga", "version": "0.5.1", "git_repo": "https://github.com/beeware/toga", "git_tag": "ef1912b0a1b5c07793f9aa372409f5b9d36f2604", "newline": "lf", "language_version": "", "ecosystem": "pypi", "purl": "pkg:pypi/toga@0.5.1", "language": "python", "build_tool": "pip", "build_commands": [["pip", "install", "-U", "pip"]]} +{"macaron_version": "0.18.0", "group_id": null, "artifact_id": "toga", "version": "0.5.1", "git_repo": "https://github.com/beeware/toga", "git_tag": "ef1912b0a1b5c07793f9aa372409f5b9d36f2604", "newline": "lf", "language_version": [">= 3.9"], "ecosystem": "pypi", "purl": "pkg:pypi/toga@0.5.1", "language": "python", "build_tool": "pip", "build_commands": [["pip", "install", "-U", "pip"]], "build_backends": {"setuptools": "==(80.3.1)", 
"setuptools_scm": "==8.3.1", "setuptools_dynamic_dependencies": "==1.0.0"}}