Skip to content

Commit e43d686

Browse files
committed
Update commit parsing pipeline to support collecting fix commits from multiple repositories
Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent 857b080 commit e43d686

File tree

6 files changed

+220
-155
lines changed

6 files changed

+220
-155
lines changed

vulnerabilities/importers/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
from vulnerabilities.pipelines.v2_importers import apache_kafka_importer as apache_kafka_importer_v2
4747
from vulnerabilities.pipelines.v2_importers import apache_tomcat_importer as apache_tomcat_v2
4848
from vulnerabilities.pipelines.v2_importers import archlinux_importer as archlinux_importer_v2
49+
from vulnerabilities.pipelines.v2_importers import collect_fix_commits as collect_fix_commits_v2
4950
from vulnerabilities.pipelines.v2_importers import curl_importer as curl_importer_v2
5051
from vulnerabilities.pipelines.v2_importers import debian_importer as debian_importer_v2
5152
from vulnerabilities.pipelines.v2_importers import (
@@ -145,5 +146,19 @@
145146
ubuntu_usn.UbuntuUSNImporter,
146147
fireeye.FireyeImporter,
147148
oss_fuzz.OSSFuzzImporter,
149+
collect_fix_commits_v2.CollectNodejsFixCommitsPipeline,
150+
collect_fix_commits_v2.CollectCpythonFixCommitsPipeline,
151+
collect_fix_commits_v2.CollectGoFixCommitsPipeline,
152+
collect_fix_commits_v2.CollectRustFixCommitsPipeline,
153+
collect_fix_commits_v2.CollectPhpFixCommitsPipeline,
154+
collect_fix_commits_v2.CollectRubyFixCommitsPipeline,
155+
collect_fix_commits_v2.CollectNginxFixCommitsPipeline,
156+
collect_fix_commits_v2.CollectPostgresFixCommitsPipeline,
157+
collect_fix_commits_v2.CollectMysqlFixCommitsPipeline,
158+
collect_fix_commits_v2.CollectGitFixCommitsPipeline,
159+
collect_fix_commits_v2.CollectTensorflowFixCommitsPipeline,
160+
collect_fix_commits_v2.CollectFirefoxFixCommitsPipeline,
161+
collect_fix_commits_v2.CollectQEMUFixCommitsPipeline,
162+
collect_fix_commits_v2.CollectDenoFixCommitsPipeline,
148163
]
149164
)

vulnerabilities/pipelines/__init__.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88
#
99

1010
import logging
11+
import re
12+
import shutil
13+
import tempfile
1114
import traceback
15+
from collections import defaultdict
1216
from datetime import datetime
1317
from datetime import timezone
1418
from timeit import default_timer as timer
@@ -19,8 +23,12 @@
1923
from aboutcode.pipeline import LoopProgress
2024
from aboutcode.pipeline import PipelineDefinition
2125
from aboutcode.pipeline import humanize_time
26+
from git import Repo
27+
from packageurl.contrib.url2purl import url2purl
2228

2329
from vulnerabilities.importer import AdvisoryData
30+
from vulnerabilities.importer import AffectedPackageV2
31+
from vulnerabilities.importer import PackageCommitPatchData
2432
from vulnerabilities.improver import MAX_CONFIDENCE
2533
from vulnerabilities.models import Advisory
2634
from vulnerabilities.models import PipelineRun
@@ -328,3 +336,109 @@ def collect_and_store_advisories(self):
328336
continue
329337

330338
self.log(f"Successfully collected {collected_advisory_count:,d} advisories")
339+
340+
341+
class CollectVCSFixCommitPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    Pipeline to collect fix commits from any git repository.

    Subclasses set ``repo_url`` to the repository to scan. Every commit whose
    message mentions a vulnerability identifier matching one of ``patterns`` is
    recorded as a fix commit for that identifier, and one ``AdvisoryData`` is
    produced per identifier.
    """

    repo_url: str
    # Regular expressions, matched case-insensitively against commit messages.
    patterns: list[str] = [
        r"\bCVE-\d{4}-\d{4,19}\b",
        r"GHSA-[2-9cfghjmpqrvwx]{4}-[2-9cfghjmpqrvwx]{4}-[2-9cfghjmpqrvwx]{4}",
    ]

    @classmethod
    def steps(cls):
        """Return the ordered pipeline steps: clone, collect and store, clean up."""
        return (
            cls.clone,
            cls.collect_and_store_advisories,
            cls.clean_downloads,
        )

    def clone(self):
        """Clone the repository into a fresh temporary directory."""
        # Remember the directory *before* cloning so that ``clean_downloads``
        # can remove it even when the clone itself fails part-way.
        self.repo_dir = tempfile.mkdtemp()
        self.repo = Repo.clone_from(
            url=self.repo_url,
            to_path=self.repo_dir,
            bare=True,
            no_checkout=True,
            # Only commit metadata is read, so file contents are not fetched.
            multi_options=["--filter=blob:none"],
        )

    def advisories_count(self) -> int:
        """Return 0: no advisory count is computed before the commits are scanned."""
        return 0

    @staticmethod
    def _commit_message(commit) -> str:
        """Return the message of ``commit`` as text, decoding raw bytes if needed."""
        message = commit.message or ""
        if isinstance(message, bytes):
            # GitPython may leave the message as bytes when it cannot decode it;
            # decode leniently rather than letting ``re`` raise a TypeError.
            message = message.decode("utf-8", errors="replace")
        return message

    @staticmethod
    def _normalize_vulnerability_id(vuln_id: str) -> str:
        """
        Return ``vuln_id`` in canonical letter case.

        Matching is case-insensitive, so the same identifier can be spelled
        ``cve-2021-1234`` or ``CVE-2021-1234``; without normalization these
        would be grouped as two different vulnerabilities.
        """
        prefix, _, rest = vuln_id.partition("-")
        prefix = prefix.upper()
        if prefix == "CVE":
            return vuln_id.upper()
        if prefix == "GHSA":
            # GHSA identifiers use an upper-case prefix and lower-case groups.
            return f"GHSA-{rest.lower()}"
        # Identifier from a custom subclass pattern: keep it as written.
        return vuln_id

    def extract_vulnerability_id(self, commit) -> list[str]:
        """
        Extract vulnerability ids from a commit message.

        Return the list of matched vulnerability IDs, normalized to their
        canonical letter case, without duplicates and in order of first match.
        """
        message = self._commit_message(commit)
        # A dict is used as an insertion-ordered set.
        unique_ids = {}
        for pattern in self.patterns:
            # ``finditer`` + ``group(0)`` always yields the full match, even if
            # a subclass supplies a pattern containing capture groups.
            for match in re.finditer(pattern, message, flags=re.IGNORECASE):
                unique_ids[self._normalize_vulnerability_id(match.group(0))] = None
        return list(unique_ids)

    def collect_fix_commits(self):
        """
        Iterate through repository commits and group them by vulnerability identifiers.

        Return a mapping of ``vuln_id`` to a list of ``(commit_id, commit_message)``
        tuples, one tuple per commit mentioning that identifier.
        """
        self.log("Processing git repository fix commits (grouped by vulnerability IDs).")

        grouped_commits = defaultdict(list)
        for commit in self.repo.iter_commits("--all"):
            matched_ids = self.extract_vulnerability_id(commit)
            if not matched_ids:
                continue

            commit_id = commit.hexsha
            commit_message = self._commit_message(commit).strip()

            for vuln_id in matched_ids:
                grouped_commits[vuln_id].append((commit_id, commit_message))

        self.log(f"Found {len(grouped_commits)} vulnerabilities with related commits.")
        self.log("Finished processing all commits.")
        return grouped_commits

    def collect_advisories(self):
        """
        Generate AdvisoryData objects for each vulnerability ID grouped with its related commits.

        Yield one ``AdvisoryData`` per vulnerability ID; its single affected
        package carries every distinct commit that mentions the ID.
        """
        self.log("Generating AdvisoryData objects from grouped commits.")
        grouped_commits = self.collect_fix_commits()
        purl = url2purl(self.repo_url)

        for vuln_id, commits_data in grouped_commits.items():
            if not commits_data or not vuln_id:
                continue

            # Sort the distinct hashes so repeated runs emit identical data.
            commit_hashes = sorted({commit_hash for commit_hash, _ in commits_data})
            affected_packages = [
                AffectedPackageV2(
                    package=purl,
                    fixed_by_commit_patches=[
                        PackageCommitPatchData(vcs_url=self.repo_url, commit_hash=commit_hash)
                        for commit_hash in commit_hashes
                    ],
                )
            ]

            yield AdvisoryData(
                advisory_id=vuln_id,
                affected_packages=affected_packages,
                url=self.repo_url,
            )

    def clean_downloads(self):
        """
        Cleanup any temporary repository data.

        Safe to call more than once and after a failed clone.
        """
        self.log("Cleaning up local repository resources.")
        repo = getattr(self, "repo", None)
        repo_dir = getattr(self, "repo_dir", None)
        if repo is not None:
            # Fall back to the repo's own location when ``clone`` was overridden
            # and did not record ``repo_dir``.
            repo_dir = repo_dir or repo.working_dir
            # Release the git helper processes/handles before deleting files.
            repo.close()

        if not repo_dir:
            return

        try:
            shutil.rmtree(path=repo_dir)
        except FileNotFoundError:
            # Already removed by an earlier cleanup call; nothing left to do.
            pass
        self.repo_dir = None

    def on_failure(self):
        """Ensure cleanup is always performed on failure."""
        self.clean_downloads()
vulnerabilities/pipelines/v2_importers/collect_fix_commits.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
from vulnerabilities.pipelines import CollectVCSFixCommitPipeline
2+
3+
4+
class CollectNodejsFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git repository at github.com/nodejs/node."""

    pipeline_id = "collect_nodejs_fix_commits"
    repo_url = "https://github.com/nodejs/node"
7+
8+
9+
class CollectCpythonFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git repository at github.com/python/cpython."""

    pipeline_id = "collect_cpython_fix_commits"
    repo_url = "https://github.com/python/cpython"
12+
13+
14+
class CollectGoFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git repository at github.com/golang/go."""

    pipeline_id = "collect_go_fix_commits"
    repo_url = "https://github.com/golang/go"
17+
18+
19+
class CollectRustFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git repository at github.com/rust-lang/rust."""

    pipeline_id = "collect_rust_lang_fix_commits"
    repo_url = "https://github.com/rust-lang/rust"
22+
23+
24+
class CollectPhpFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git repository at github.com/php/php-src."""

    pipeline_id = "collect_php_fix_commits"
    repo_url = "https://github.com/php/php-src"
27+
28+
29+
class CollectRubyFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git repository at github.com/ruby/ruby."""

    pipeline_id = "collect_ruby_fix_commits"
    repo_url = "https://github.com/ruby/ruby"
32+
33+
34+
class CollectNginxFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git repository at github.com/nginx/nginx."""

    pipeline_id = "collect_nginx_fix_commits"
    repo_url = "https://github.com/nginx/nginx"
37+
38+
39+
class CollectPostgresFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git repository at github.com/postgres/postgres."""

    pipeline_id = "collect_postgres_fix_commits"
    repo_url = "https://github.com/postgres/postgres"
42+
43+
44+
class CollectMysqlFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git repository at github.com/mysql/mysql-server."""

    pipeline_id = "collect_mysql_fix_commits"
    repo_url = "https://github.com/mysql/mysql-server"
47+
48+
49+
class CollectGitFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git repository at github.com/git/git."""

    pipeline_id = "collect_git_fix_commits"
    repo_url = "https://github.com/git/git"
52+
53+
54+
class CollectTensorflowFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git repository at github.com/tensorflow/tensorflow."""

    pipeline_id = "collect_tensorflow_fix_commits"
    repo_url = "https://github.com/tensorflow/tensorflow"
57+
58+
59+
class CollectFirefoxFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git repository at github.com/mozilla-firefox/firefox."""

    pipeline_id = "collect_firefox_fix_commits"
    repo_url = "https://github.com/mozilla-firefox/firefox"
62+
63+
64+
class CollectQEMUFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git repository at github.com/qemu/qemu."""

    pipeline_id = "collect_qemu_fix_commits"
    repo_url = "https://github.com/qemu/qemu"
67+
68+
69+
class CollectDenoFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the git repository at github.com/denoland/deno."""

    pipeline_id = "collect_deno_fix_commits"
    repo_url = "https://github.com/denoland/deno"

vulnerabilities/pipelines/v2_importers/collect_repo_fix_commits.py

Lines changed: 0 additions & 129 deletions
This file was deleted.

0 commit comments

Comments
 (0)