Skip to content

Commit 132e64b

Browse files
committed
Try to optimize the CollectReferencesFixCommitsPipeline pipeline
Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent 63e5078 commit 132e64b

File tree

1 file changed

+31
-16
lines changed

1 file changed

+31
-16
lines changed

vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py

Lines changed: 31 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,8 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10+
from collections import defaultdict
11+
1012
from aboutcode.pipeline import LoopProgress
1113
from django.db.models import Prefetch
1214
from packageurl.contrib.purl2url import purl2url
@@ -65,7 +67,7 @@ def collect_and_store_fix_commits(self):
6567

6668
commit_batch = []
6769
updated_pkg_patch_commit_count = 0
68-
batch_size = 1000
70+
batch_size = 10000
6971
for adv in progress.iter(advisories.paginated(per_page=batch_size)):
7072
urls = {r.url for r in adv.references.all()} | {p.patch_url for p in adv.patches.all()}
7173

@@ -90,14 +92,22 @@ def bulk_commit_batch_update(self, vcs_data_table):
9092
impact_data = {(row[0], row[3]) for row in vcs_data_table} # base_purl, adv_id
9193
commit_data = {(row[1], row[2]) for row in vcs_data_table} # vcs_url, commit_hash
9294

93-
adv_ids = {aid for _, aid in impact_data}
94-
existing_impacts = ImpactedPackage.objects.filter(advisory_id__in=adv_ids)
95-
existing_impact_pairs = {(ip.base_purl, ip.advisory_id) for ip in existing_impacts}
95+
adv_ids = {adv_id for _, adv_id in impact_data}
96+
commit_hashes = {commit_hash for _, commit_hash in commit_data}
9697

97-
new_impacts = impact_data - existing_impact_pairs
98-
if new_impacts:
98+
existing_impacts = ImpactedPackage.objects.filter(advisory_id__in=adv_ids).only(
99+
"base_purl", "advisory_id"
100+
)
101+
existing_impact_pairs = {
102+
(impact_pkg.base_purl, impact_pkg.advisory_id) for impact_pkg in existing_impacts
103+
}
104+
105+
if new_impacts := impact_data - existing_impact_pairs:
99106
ImpactedPackage.objects.bulk_create(
100-
[ImpactedPackage(base_purl=bp, advisory_id=aid) for bp, aid in new_impacts]
107+
[
108+
ImpactedPackage(base_purl=base_purl, advisory_id=adv_id)
109+
for base_purl, adv_id in new_impacts
110+
]
101111
)
102112

103113
PackageCommitPatch.objects.bulk_create(
@@ -108,23 +118,28 @@ def bulk_commit_batch_update(self, vcs_data_table):
108118
ignore_conflicts=True,
109119
)
110120

111-
adv_ids = {adv_id for _, adv_id in impact_data}
112121
fetched_impacts = {
113122
(impacted_pkg.base_purl, impacted_pkg.advisory_id): impacted_pkg
114-
for impacted_pkg in ImpactedPackage.objects.filter(advisory_id__in=adv_ids)
123+
for impacted_pkg in ImpactedPackage.objects.filter(advisory_id__in=adv_ids).only(
124+
"base_purl", "advisory_id"
125+
)
115126
}
116127

117-
commit_hashes = {commit_hash for _, commit_hash in commit_data}
118-
fetched_commits = {
128+
fetched_pkg_commits = {
119129
(pkg_commit_patch.vcs_url, pkg_commit_patch.commit_hash): pkg_commit_patch
120-
for pkg_commit_patch in PackageCommitPatch.objects.filter(commit_hash__in=commit_hashes)
130+
for pkg_commit_patch in PackageCommitPatch.objects.filter(
131+
commit_hash__in=commit_hashes
132+
).only("vcs_url", "commit_hash")
121133
}
122134

135+
pkg_commit_add_impact_pkg = defaultdict(list)
123136
for base_purl, vcs_url, commit_hash, adv_id in vcs_data_table:
124-
impacted_package = fetched_impacts.get((base_purl, adv_id))
125-
package_commit_obj = fetched_commits.get((vcs_url, commit_hash))
137+
impacted_pkg_obj = fetched_impacts.get((base_purl, adv_id))
138+
pkg_commit_obj = fetched_pkg_commits.get((vcs_url, commit_hash))
139+
if impacted_pkg_obj and pkg_commit_obj:
140+
pkg_commit_add_impact_pkg[pkg_commit_obj].append(impacted_pkg_obj)
126141

127-
if impacted_package and package_commit_obj:
128-
package_commit_obj.fixed_in_impacts.add(impacted_package)
142+
for pkg_commit_obj, impact_pkgs in pkg_commit_add_impact_pkg.items():
143+
pkg_commit_obj.fixed_in_impacts.add(*impact_pkgs)
129144

130145
return len(vcs_data_table)

0 commit comments

Comments (0)