Implement provider for Chainguard Linux (#132)
* Implement provider for Chainguard Linux

Signed-off-by: Dan Luhring <[email protected]>

* Fix tests

Signed-off-by: Dan Luhring <[email protected]>

* wip: configs.yaml

Signed-off-by: Dan Luhring <[email protected]>

* Fix failing test

Signed-off-by: Dan Luhring <[email protected]>

* Use main branch of Grype for quality gate

Signed-off-by: Dan Luhring <[email protected]>

* Use latest grype and grype-db

Signed-off-by: Dan Luhring <[email protected]>

---------

Signed-off-by: Dan Luhring <[email protected]>
luhring authored Mar 28, 2023
1 parent dba31ab commit 948d8d1
Showing 16 changed files with 1,060 additions and 154 deletions.
4 changes: 2 additions & 2 deletions Makefile
@@ -79,11 +79,11 @@ dev: ## Get a development shell with locally editable grype, grype-db, and vunn
 	@DEV_VUNNEL_BIN_DIR=$(ABS_BIN_DIR) .github/scripts/dev-shell.sh $(provider) $(providers)
 
 .PHONY: build-grype
-build-grype: $(TEMP_DIR) ## Build grype for local development
+build-grype: $(BIN_DIR) ## Build grype for local development
 	@cd $(GRYPE_PATH) && go build -o $(ABS_BIN_DIR)/grype .
 
 .PHONY: build-grype-db
-build-grype-db: $(TEMP_DIR) ## Build grype-db for local development
+build-grype-db: $(BIN_DIR) ## Build grype-db for local development
 	@cd $(GRYPE_DB_PATH) && go build -o $(ABS_BIN_DIR)/grype-db ./cmd/grype-db
 
 .PHONY: update-db
1 change: 1 addition & 0 deletions src/vunnel/cli/config.py
@@ -16,6 +16,7 @@ class Providers:
     alpine: providers.alpine.Config = field(default_factory=providers.alpine.Config)
     amazon: providers.amazon.Config = field(default_factory=providers.amazon.Config)
     centos: providers.centos.Config = field(default_factory=providers.centos.Config)
+    chainguard: providers.chainguard.Config = field(default_factory=providers.chainguard.Config)
     debian: providers.debian.Config = field(default_factory=providers.debian.Config)
     github: providers.github.Config = field(default_factory=providers.github.Config)
     nvd: providers.nvd.Config = field(default_factory=providers.nvd.Config)
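
Since every visible field of Providers carries a default factory, the new provider config should be reachable with no extra wiring. A minimal sketch of that, assuming the collapsed fields of Providers also default-construct:

```python
# Hypothetical check that the chainguard config is wired into the CLI config;
# assumes the remaining (collapsed) Providers fields also have defaults.
from vunnel.cli.config import Providers

cfg = Providers()
print(cfg.chainguard.request_timeout)  # 125, per the provider's Config default below
```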
2 changes: 2 additions & 0 deletions src/vunnel/providers/__init__.py
@@ -8,6 +8,7 @@
     alpine,
     amazon,
     centos,
+    chainguard,
     debian,
     github,
     nvd,
@@ -38,6 +39,7 @@
     sles.Provider.name(): sles.Provider,
     ubuntu.Provider.name(): ubuntu.Provider,
     wolfi.Provider.name(): wolfi.Provider,
+    chainguard.Provider.name(): chainguard.Provider,
 }
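
The second hunk adds the provider to the name-to-class mapping; the dict's own name is collapsed out of the diff, so the sketch below reconstructs it under a stand-in name:

```python
# Stand-in reconstruction of the mapping above (only the dict's name is assumed).
from vunnel.providers import chainguard, wolfi

registry = {
    wolfi.Provider.name(): wolfi.Provider,
    chainguard.Provider.name(): chainguard.Provider,
}

assert registry["chainguard"] is chainguard.Provider  # Provider.name() returns "chainguard"
```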
64 changes: 64 additions & 0 deletions src/vunnel/providers/chainguard/__init__.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+from vunnel import provider, result, schema
+from vunnel.providers.wolfi.parser import Parser
+
+if TYPE_CHECKING:
+    import datetime
+
+
+@dataclass
+class Config:
+    runtime: provider.RuntimeConfig = field(
+        default_factory=lambda: provider.RuntimeConfig(
+            result_store=result.StoreStrategy.SQLITE,
+            existing_results=provider.ResultStatePolicy.DELETE_BEFORE_WRITE,
+        ),
+    )
+    request_timeout: int = 125
+
+
+class Provider(provider.Provider):
+    _url = "https://packages.cgr.dev/chainguard/security.json"
+    _namespace = "chainguard"
+
+    def __init__(self, root: str, config: Config | None = None):
+        if not config:
+            config = Config()
+        super().__init__(root, runtime_cfg=config.runtime)
+        self.config = config
+
+        self.logger.debug(f"config: {config}")
+
+        self.schema = schema.OSSchema()
+        self.parser = Parser(
+            workspace=self.workspace,
+            url=self._url,
+            namespace=self._namespace,
+            download_timeout=self.config.request_timeout,
+            logger=self.logger,
+        )
+
+        # this provider requires the previous state from former runs
+        provider.disallow_existing_input_policy(config.runtime)
+
+    @classmethod
+    def name(cls) -> str:
+        return "chainguard"
+
+    def update(self, last_updated: datetime.datetime | None) -> tuple[list[str], int]:
+        with self.results_writer() as writer:
+            # TODO: tech debt: on subsequent runs, we should only write new vulns (this currently re-writes all)
+            for release, vuln_dict in self.parser.get():
+                for vuln_id, record in vuln_dict.items():
+                    writer.write(
+                        identifier=os.path.join(f"{self._namespace.lower()}:{release.lower()}", vuln_id),
+                        schema=self.schema,
+                        payload=record,
+                    )
+
+        return [self._url], len(writer)
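
Note how update() builds result identifiers: the lowercased namespace and release form a prefix that is path-joined with the vulnerability ID. A small sketch of the same expression with made-up values:

```python
import os

namespace = "chainguard"
release = "rolling"          # the wolfi parser yields its _release_ constant
vuln_id = "CVE-2023-12345"   # hypothetical example ID

identifier = os.path.join(f"{namespace.lower()}:{release.lower()}", vuln_id)
print(identifier)  # chainguard:rolling/CVE-2023-12345
```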
11 changes: 8 additions & 3 deletions src/vunnel/providers/wolfi/__init__.py
@@ -6,7 +6,7 @@
 
 from vunnel import provider, result, schema
 
-from .parser import Parser, namespace
+from .parser import Parser
 
 if TYPE_CHECKING:
     import datetime
@@ -24,6 +24,9 @@ class Config:
 
 
 class Provider(provider.Provider):
+    _url = "https://packages.wolfi.dev/os/security.json"
+    _namespace = "wolfi"
+
     def __init__(self, root: str, config: Config | None = None):
         if not config:
             config = Config()
@@ -35,6 +38,8 @@ def __init__(self, root: str, config: Config | None = None):
         self.schema = schema.OSSchema()
         self.parser = Parser(
             workspace=self.workspace,
+            url=self._url,
+            namespace=self._namespace,
             download_timeout=self.config.request_timeout,
             logger=self.logger,
         )
@@ -52,9 +57,9 @@ def update(self, last_updated: datetime.datetime | None) -> tuple[list[str], int
             for release, vuln_dict in self.parser.get():
                 for vuln_id, record in vuln_dict.items():
                     writer.write(
-                        identifier=os.path.join(f"{namespace.lower()}:{release.lower()}", vuln_id),
+                        identifier=os.path.join(f"{self._namespace.lower()}:{release.lower()}", vuln_id),
                         schema=self.schema,
                         payload=record,
                    )
 
-        return self.parser.urls, len(writer)
+        return [self._url], len(writer)
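
With the URL and namespace hoisted into constructor arguments (see the parser diff below), one parser implementation now serves both distros. A minimal sketch, where SimpleNamespace stands in for vunnel's real workspace object:

```python
from types import SimpleNamespace

from vunnel.providers.wolfi.parser import Parser

workspace = SimpleNamespace(input_path="/tmp/vunnel-input")  # stand-in workspace

wolfi = Parser(workspace=workspace, url="https://packages.wolfi.dev/os/security.json", namespace="wolfi")
chainguard = Parser(workspace=workspace, url="https://packages.cgr.dev/chainguard/security.json", namespace="chainguard")

# both URLs end in the same basename, so each parser stores "security.json"
print(wolfi._db_filename, chainguard._db_filename)
```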
165 changes: 77 additions & 88 deletions src/vunnel/providers/wolfi/parser.py
@@ -1,35 +1,45 @@
 from __future__ import annotations
 
 import copy
-import glob
 import json
 import logging
 import os
 import re
+from urllib.parse import urlparse
 
 import requests
 
 from vunnel import utils
 from vunnel.utils import vulnerability
 
-namespace = "wolfi"
-
 
 class Parser:
-    _url_ = "https://packages.wolfi.dev"
     _release_ = "rolling"
     _secdb_dir_ = "secdb"
-    _db_types = ["os"]
 
-    def __init__(self, workspace, download_timeout=125, url=None, logger=None):
+    def __init__(  # noqa: PLR0913
+        self,
+        workspace,
+        url: str,
+        namespace: str,
+        download_timeout: int = 125,
+        logger: logging.Logger | None = None,
+    ):
         self.download_timeout = download_timeout
         self.secdb_dir_path = os.path.join(workspace.input_path, self._secdb_dir_)
-        self.metadata_url = url.strip("/") if url else Parser._url_
-        self.urls = []
+        self.url = url
+        self.namespace = namespace
+        self._db_filename = self._extract_filename_from_url(url)
 
         if not logger:
             logger = logging.getLogger(self.__class__.__name__)
         self.logger = logger
 
+    @staticmethod
+    def _extract_filename_from_url(url):
+        return os.path.basename(urlparse(url).path)
+
     @utils.retry_with_backoff()
     def _download(self):
         """
@@ -39,55 +49,38 @@ def _download(self):
         if not os.path.exists(self.secdb_dir_path):
             os.makedirs(self.secdb_dir_path, exist_ok=True)
 
-        for t in self._db_types:
-            try:
-                rel_dir = os.path.join(self.secdb_dir_path, t)
-                os.makedirs(rel_dir, exist_ok=True)
-
-                filename = "security.json"
-                download_url = f"{self.metadata_url}/{t}/{filename}"
-
-                self.urls.append(download_url)
-
-                self.logger.info(f"downloading Wolfi secdb {download_url}")
-                r = requests.get(download_url, stream=True, timeout=self.download_timeout)
-                if r.status_code == 200:
-                    file_path = os.path.join(rel_dir, filename)
-                    with open(file_path, "wb") as fp:
-                        for chunk in r.iter_content():
-                            fp.write(chunk)
-                else:
-                    r.raise_for_status()
-            except:  # noqa
-                self.logger.exception(f"ignoring error processing secdb for {t}")
+        try:
+            self.logger.info(f"downloading {self.namespace} secdb {self.url}")
+            r = requests.get(self.url, stream=True, timeout=self.download_timeout)
+            if r.status_code == 200:
+                file_path = os.path.join(self.secdb_dir_path, self._db_filename)
+                with open(file_path, "wb") as fp:
+                    for chunk in r.iter_content():
+                        fp.write(chunk)
+            else:
+                r.raise_for_status()
+        except:  # noqa
+            self.logger.exception(f"ignoring error processing secdb for {self.url}")
 
     def _load(self):
         """
-        Loads all db json an yield it
+        Loads all db json and yields it
         :return:
         """
-        dbtype_data_dict = {}
-
-        # parse and transform the json
         try:
-            if os.path.exists(self.secdb_dir_path):
-                for s in glob.glob(f"{self.secdb_dir_path}/**/security.json", recursive=True):
-                    dbtype = s.split("/")[-2]
+            with open(f"{self.secdb_dir_path}/{self._db_filename}") as fh:
+                dbtype_data_dict = json.load(fh)
 
-                    if os.path.exists(s):
-                        self.logger.debug(f"loading secdb data from: {s}")
-                        with open(s, encoding="utf-8") as fh:
-                            dbtype_data_dict[dbtype] = json.load(fh)
-
-                yield "rolling", dbtype_data_dict
-            else:
-                raise Exception("Cannot find Wolfi sec db source ")
+                yield self._release_, dbtype_data_dict
         except Exception:
-            self.logger.exception("failed to load Wolfi sec db data")
+            self.logger.exception(f"failed to load {self.namespace} sec db data")
             raise
 
-    # noqa
-    def _normalize(self, release, dbtype_data_dict):
+    def _normalize(self, release, data):
         """
         Normalize all the sec db entries into vulnerability payload records
         :param release:
@@ -97,52 +90,48 @@ def _normalize(self, release, dbtype_data_dict):
 
         vuln_dict = {}
 
-        for dbtype, data in dbtype_data_dict.items():
-            self.logger.debug(f"normalizing {release}:{dbtype}")
-
-            if not data["packages"]:
-                continue
-
-            for el in data["packages"]:
-                pkg_el = el["pkg"]
-
-                pkg = pkg_el["name"]
-                for pkg_version in pkg_el["secfixes"]:
-                    vids = []
-                    if pkg_el["secfixes"][pkg_version]:
-                        for rawvid in pkg_el["secfixes"][pkg_version]:
-                            tmp = rawvid.split()
-                            for newvid in tmp:
-                                if newvid not in vids:
-                                    vids.append(newvid)
-
-                    for vid in vids:
-                        if not re.match("^CVE-.*", vid):
-                            # skip non-CVE records
-                            continue
-
-                        if vid not in vuln_dict:
-                            # create a new record
-                            vuln_dict[vid] = copy.deepcopy(vulnerability.vulnerability_element)
-                            vuln_record = vuln_dict[vid]
-
-                            # populate the static information about the new vuln record
-                            vuln_record["Vulnerability"]["Name"] = str(vid)
-                            vuln_record["Vulnerability"]["NamespaceName"] = namespace + ":" + str(release)
-                            vuln_record["Vulnerability"]["Link"] = "http://cve.mitre.org/cgi-bin/cvename.cgi?name=" + str(vid)
-                            vuln_record["Vulnerability"]["Severity"] = "Unknown"
-                        else:
-                            vuln_record = vuln_dict[vid]
-
-                        # SET UP fixedins
-                        fixed_el = {
-                            "Name": pkg,
-                            "Version": pkg_version,
-                            "VersionFormat": "apk",
-                            "NamespaceName": namespace + ":" + str(release),
-                        }
-
-                        vuln_record["Vulnerability"]["FixedIn"].append(fixed_el)
+        self.logger.debug("normalizing vulnerability data")
+
+        for el in data["packages"]:
+            pkg_el = el["pkg"]
+
+            pkg = pkg_el["name"]
+            for pkg_version in pkg_el["secfixes"]:
+                vids = []
+                if pkg_el["secfixes"][pkg_version]:
+                    for rawvid in pkg_el["secfixes"][pkg_version]:
+                        tmp = rawvid.split()
+                        for newvid in tmp:
+                            if newvid not in vids:
+                                vids.append(newvid)
+
+                for vid in vids:
+                    if not re.match("^CVE-.*", vid):
+                        # skip non-CVE records
+                        continue
+
+                    if vid not in vuln_dict:
+                        # create a new record
+                        vuln_dict[vid] = copy.deepcopy(vulnerability.vulnerability_element)
+                        vuln_record = vuln_dict[vid]
+
+                        # populate the static information about the new vuln record
+                        vuln_record["Vulnerability"]["Name"] = str(vid)
+                        vuln_record["Vulnerability"]["NamespaceName"] = self.namespace + ":" + str(release)
+                        vuln_record["Vulnerability"]["Link"] = "http://cve.mitre.org/cgi-bin/cvename.cgi?name=" + str(vid)
+                        vuln_record["Vulnerability"]["Severity"] = "Unknown"
+                    else:
+                        vuln_record = vuln_dict[vid]
+
+                    # SET UP fixedins
+                    fixed_el = {
+                        "Name": pkg,
+                        "Version": pkg_version,
+                        "VersionFormat": "apk",
+                        "NamespaceName": self.namespace + ":" + str(release),
+                    }
+
+                    vuln_record["Vulnerability"]["FixedIn"].append(fixed_el)
 
         return vuln_dict
 
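
For reference, _normalize consumes the Alpine-style secdb layout: a top-level "packages" list whose entries hold a "pkg" object, and whose "secfixes" map a fixed version to the vulnerability IDs it resolves. A minimal document of that shape (package and CVE values invented for illustration):

```python
data = {
    "packages": [
        {
            "pkg": {
                "name": "openssl",  # hypothetical package
                "secfixes": {
                    # fixed version -> IDs resolved by that version
                    "3.0.8-r0": ["CVE-2023-0286"],
                },
            },
        },
    ],
}
```

Fed through _normalize, each CVE becomes one record whose NamespaceName is now derived from self.namespace (e.g. wolfi:rolling or chainguard:rolling), with a FixedIn entry using the "apk" version format.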
14 changes: 12 additions & 2 deletions tests/quality/config.yaml
@@ -14,7 +14,7 @@ yardstick:
   #  - "latest" to use the latest released grype
   #  - a released version name (e.g. "v0.52.1")
   #  - a branch name (e.g. "dev-fix-foo")
-  #  - a repo reference and optional "@branch" (e.g. "my-user-fork/grype@dev-fix-foo")
+  #  - a repo reference and optional "@branch" (e.g. "github.com/my-user-fork/grype@dev-fix-foo")
   # Note:
   #  - ALWAYS leave the "import-db" annotation as-is
   #  - this version should ALWAYS match that of the other "grype" tool below
@@ -26,7 +26,7 @@
   #  - "latest" to use the latest released grype
   #  - a released version name (e.g. "v0.52.1")
   #  - a branch name (e.g. "dev-fix-foo")
-  #  - a repo reference and optional "@branch" (e.g. "my-user-fork/grype@dev-fix-foo")
+  #  - a repo reference and optional "@branch" (e.g. "github.com/my-user-fork/grype@dev-fix-foo")
   # Note:
   #  - this version should ALWAYS match that of the other "grype" tool above
   version: latest
@@ -64,6 +64,16 @@ tests:
   #   images:
   #     - docker.io/centos:6@sha256:3688aa867eb84332460e172b9250c9c198fdfd8d987605fd53f246f498c60bcf
 
+  - provider: chainguard
+    additional_providers:
+      - name: nvd
+        use_cache: true
+    additional-trigger-globs:
+      # this provider imports and uses the wolfi provider code
+      - src/vunnel/providers/wolfi/**
+    images:
+      - ghcr.io/chainguard-images/scanner-test:latest@sha256:59bddc101fba0c45d5c093575c6bc5bfee7f0e46ff127e6bb4e5acaaafb525f9
+
   - provider: debian
     # ideally we would not use cache; however, in order to test if we are properly keeping the processing
     # of legacy information that is in the debian data cache (for debian 7, 8, and 9) we must test with